Diffstat (limited to 'test/Transforms')
603 files changed, 30702 insertions, 6084 deletions
diff --git a/test/Transforms/ADCE/delete-profiling-calls-to-constant.ll b/test/Transforms/ADCE/delete-profiling-calls-to-constant.ll
index a61e8f8caccb..804b3dd67f2a 100644
--- a/test/Transforms/ADCE/delete-profiling-calls-to-constant.ll
+++ b/test/Transforms/ADCE/delete-profiling-calls-to-constant.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -adce | FileCheck %s
-; RUN: opt < %s -passes=adce | FileCheck %s
+; RUN: opt < %s -adce -S | FileCheck %s
+; RUN: opt < %s -passes=adce -S | FileCheck %s
 
 ; Verify that a call to instrument a constant is deleted.
 
@@ -7,7 +7,7 @@
 @__profd_foo = private global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 6699318081062747564, i64 0, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_foo, i32 0, i32 0), i8* bitcast (i32 ()* @foo to i8*), i8* null, i32 1, [1 x i16] [i16 1] }, section "__llvm_prf_data", align 8
 
 define i32 @foo() {
-; CHECK-NOT: __llvm_profile_instrument_target
+; CHECK-NOT: call void @__llvm_profile_instrument_target
 entry:
   tail call void @__llvm_profile_instrument_target(i64 ptrtoint (i32 (i32)* @bar to i64), i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [1 x i16] }* @__profd_foo to i8*), i32 0)
   %call = tail call i32 @bar(i32 21)
diff --git a/test/Transforms/AddDiscriminators/basic.ll b/test/Transforms/AddDiscriminators/basic.ll
index 801eda2b0665..a781c0d409bc 100644
--- a/test/Transforms/AddDiscriminators/basic.ll
+++ b/test/Transforms/AddDiscriminators/basic.ll
@@ -58,5 +58,5 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 ; CHECK: ![[FOO:[0-9]+]] = distinct !DISubprogram(name: "foo"
 ; CHECK: ![[BLOCK:[0-9]+]] = distinct !DILexicalBlock(scope: ![[FOO]],{{.*}} line: 3)
 ; CHECK: ![[THEN]] = !DILocation(line: 3, scope: ![[BLOCKFILE:[0-9]+]])
-; CHECK: ![[BLOCKFILE]] = !DILexicalBlockFile(scope: ![[BLOCK]],{{.*}} discriminator: 1)
+; CHECK: ![[BLOCKFILE]] = !DILexicalBlockFile(scope: ![[BLOCK]],{{.*}} discriminator: 2)
 ; CHECK: ![[END]] = !DILocation(line: 4, scope: ![[FOO]])
diff --git a/test/Transforms/AddDiscriminators/call-nested.ll b/test/Transforms/AddDiscriminators/call-nested.ll
index 481d6f260047..4d5145abafe1 100644
--- a/test/Transforms/AddDiscriminators/call-nested.ll
+++ b/test/Transforms/AddDiscriminators/call-nested.ll
@@ -47,4 +47,4 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !14 = !DILocation(line: 4, column: 3, scope: !4)
 
 ; CHECK: ![[CALL2]] = !DILocation(line: 4, column: 10, scope: ![[CALL2BLOCK:[0-9]+]])
-; CHECK: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 1)
+; CHECK: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 2)
diff --git a/test/Transforms/AddDiscriminators/call.ll b/test/Transforms/AddDiscriminators/call.ll
index 847a6ad4dc3a..49aca5a488f5 100644
--- a/test/Transforms/AddDiscriminators/call.ll
+++ b/test/Transforms/AddDiscriminators/call.ll
@@ -5,7 +5,7 @@
 ; #1 void bar();
 ; #2
 ; #3 void foo() {
-; #4   bar();bar()/*discriminator 1*/;bar()/*discriminator 2*/;
+; #4   bar();bar()/*discriminator 2*/;bar()/*discriminator 4*/;
 ; #5 }
 
 ; Function Attrs: uwtable
@@ -14,8 +14,8 @@ define void @_Z3foov() #0 !dbg !4 {
 ; CHECK: call void @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
   %a = alloca [100 x i8], align 16
   %b = bitcast [100 x i8]* %a to i8*
-  call void @llvm.lifetime.start(i64 100, i8* %b), !dbg !11
-  call void @llvm.lifetime.end(i64 100, i8* %b), !dbg !11
+  call void @llvm.lifetime.start.p0i8(i64 100, i8* %b), !dbg !11
+  call void @llvm.lifetime.end.p0i8(i64 100, i8* %b), !dbg !11
   call void @_Z3barv(), !dbg !11
 ; CHECK: call void @_Z3barv(), !dbg ![[CALL1:[0-9]+]]
   call void @_Z3barv(), !dbg !12
@@ -24,8 +24,8 @@ define void @_Z3foov() #0 !dbg !4 {
 }
 
 declare void @_Z3barv() #1
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind argmemonly
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind argmemonly
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind argmemonly
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind argmemonly
 
 attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -49,6 +49,6 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !13 = !DILocation(line: 5, column: 1, scope: !4)
 
 ; CHECK: ![[CALL1]] = !DILocation(line: 4, column: 9, scope: ![[CALL1BLOCK:[0-9]+]])
-; CHECK: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 1)
+; CHECK: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 2)
 ; CHECK: ![[CALL2]] = !DILocation(line: 4, column: 15, scope: ![[CALL2BLOCK:[0-9]+]])
-; CHECK: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 2)
+; CHECK: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
diff --git a/test/Transforms/AddDiscriminators/diamond.ll b/test/Transforms/AddDiscriminators/diamond.ll
index b3afe7285472..307e95f41e18 100644
--- a/test/Transforms/AddDiscriminators/diamond.ll
+++ b/test/Transforms/AddDiscriminators/diamond.ll
@@ -10,7 +10,7 @@
 ; #6 }
 
 ; bar(5): discriminator 0
-; bar(3): discriminator 1
+; bar(3): discriminator 2
 
 ; Function Attrs: uwtable
 define void @_Z3fooi(i32 %i) #0 !dbg !4 {
@@ -69,4 +69,4 @@ attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !20 = !DILocation(line: 6, column: 1, scope: !4)
 
 ; CHECK: ![[ELSE]] = !DILocation(line: 5, column: 18, scope: ![[ELSEBLOCK:[0-9]+]])
-; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 1)
+; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 2)
diff --git a/test/Transforms/AddDiscriminators/first-only.ll b/test/Transforms/AddDiscriminators/first-only.ll
index 1bd8dae5d05c..dd2117a5b187 100644
--- a/test/Transforms/AddDiscriminators/first-only.ll
+++ b/test/Transforms/AddDiscriminators/first-only.ll
@@ -69,7 +69,7 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !12 = !DILocation(line: 3, scope: !13)
 !13 = distinct !DILexicalBlock(line: 3, column: 0, file: !1, scope: !11)
-; CHECK: !DILexicalBlockFile(scope: ![[BLOCK2:[0-9]+]],{{.*}} discriminator: 1)
+; CHECK: !DILexicalBlockFile(scope: ![[BLOCK2:[0-9]+]],{{.*}} discriminator: 2)
 
 !14 = !DILocation(line: 4, scope: !13)
 ; CHECK: ![[BLOCK2]] = distinct !DILexicalBlock(scope: ![[BLOCK1]],{{.*}} line: 3)
diff --git a/test/Transforms/AddDiscriminators/inlined.ll b/test/Transforms/AddDiscriminators/inlined.ll
index 2e8ea97348d0..226e903ee212 100644
--- a/test/Transforms/AddDiscriminators/inlined.ll
+++ b/test/Transforms/AddDiscriminators/inlined.ll
@@ -62,8 +62,8 @@ attributes #3 = { nounwind readnone }
 !12 = distinct !DISubprogram(name: "g", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, variables: !2)
 !13 = distinct !DILocation(line: 1, column: 17, scope: !14)
 ; CHECK: ![[BF:.*]] = !DILexicalBlockFile(scope: ![[LB1:[0-9]+]],
-; CHECK-SAME: discriminator: 1)
-!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 1)
+; CHECK-SAME: discriminator: 2)
+!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2)
 ; CHECK: ![[LB1]] = distinct !DILexicalBlock(scope: ![[LB2:[0-9]+]],
 ; CHECK-SAME: line: 1, column: 16)
 !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 1, column: 16)
diff --git a/test/Transforms/AddDiscriminators/memcpy-discriminator.ll b/test/Transforms/AddDiscriminators/memcpy-discriminator.ll
new file mode 100644
index 000000000000..00642d29502e
--- /dev/null
+++ b/test/Transforms/AddDiscriminators/memcpy-discriminator.ll
@@ -0,0 +1,104 @@
+; RUN: opt < %s -add-discriminators -sroa -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test case obtained from the following C code:
+
+; struct A {
+;   int field1;
+;   short field2;
+; };
+;
+; struct B {
+;   struct A field1;
+;   int field2;
+; };
+;
+;
+; extern struct B g_b;
+; extern int bar(struct B b, int c);
+;
+; int foo(int cond) {
+;   int result = cond ? bar(g_b, 33) : 42;
+;   return result;
+; }
+
+; In this test, global variable g_b is passed by copy to function bar. That
+; copy is located on the stack (see alloca %g_b.coerce), and it is initialized
+; by a memcpy call.
+;
+; SROA would split alloca %g_b.coerce into two (smaller disjoint) slices:
+; slice [0,8) and slice [8, 12). Users of the original alloca are rewritten
+; as users of the new alloca slices.
+; In particular, the memcpy is rewritten by SROA as two load/store pairs.
+;
+; Later on, mem2reg successfully promotes the new alloca slices to registers,
+; and loads %3 and %5 are made redundant by the loads obtained from the memcpy
+; intrinsic expansion.
+;
+; If pass AddDiscriminators doesn't assign a discriminator to the intrinsic
+; memcpy call, then the loads obtained from the memcpy expansion would not have
+; a correct discriminator.
+;
+; This test checks that the two new loads inserted by SROA in %cond.true
+; correctly reference a debug location with a non-zero discriminator. This test
+; also checks that the same discriminator is used by all instructions from
+; basic block %cond.true.
+
+%struct.B = type { %struct.A, i32 }
+%struct.A = type { i32, i16 }
+
+@g_b = external global %struct.B, align 4
+
+define i32 @foo(i32 %cond) #0 !dbg !5 {
+entry:
+  %g_b.coerce = alloca { i64, i32 }, align 4
+  %tobool = icmp ne i32 %cond, 0, !dbg !7
+  br i1 %tobool, label %cond.true, label %cond.end, !dbg !7
+
+cond.true:
+; CHECK-LABEL: cond.true:
+; CHECK: load i64, {{.*}}, !dbg ![[LOC:[0-9]+]]
+; CHECK-NEXT: load i32, {{.*}}, !dbg ![[LOC]]
+; CHECK-NEXT: %call = call i32 @bar({{.*}}), !dbg ![[LOC]]
+; CHECK-NEXT: br label %cond.end, !dbg ![[BR_LOC:[0-9]+]]
+
+; CHECK-DAG: ![[LOC]] = !DILocation(line: 16, column: 23, scope: ![[SCOPE:[0-9]+]])
+; CHECK-DAG: ![[SCOPE]] = !DILexicalBlockFile({{.*}}, discriminator: 2)
+; CHECK-DAG: ![[BR_LOC]] = !DILocation(line: 16, column: 16, scope: ![[SCOPE]])
+
+  %0 = bitcast { i64, i32 }* %g_b.coerce to i8*, !dbg !8
+  %1 = bitcast %struct.B* @g_b to i8*, !dbg !8
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 12, i32 4, i1 false), !dbg !8
+  %2 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %g_b.coerce, i32 0, i32 0, !dbg !8
+  %3 = load i64, i64* %2, align 4, !dbg !8
+  %4 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %g_b.coerce, i32 0, i32 1, !dbg !8
+  %5 = load i32, i32* %4, align 4, !dbg !8
+  %call = call i32 @bar(i64 %3, i32 %5, i32 33), !dbg !8
+  br label %cond.end, !dbg !7
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond1 = phi i32 [ %call, %cond.true ], [ 42, %entry ], !dbg !7
+  ret i32 %cond1, !dbg !9
+}
+
+declare i32 @bar(i64, i32, i32)
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { noinline nounwind uwtable }
+attributes #1 = { argmemonly nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 15, type: !6, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!6 = !DISubroutineType(types: !2)
+!7 = !DILocation(line: 16, column: 16, scope: !5)
+!8 = !DILocation(line: 16, column: 23, scope: !5)
+!9 = !DILocation(line: 17, column: 3, scope: !5)
diff --git a/test/Transforms/AddDiscriminators/multiple.ll b/test/Transforms/AddDiscriminators/multiple.ll
index 387689caddff..b4c353cf00f1 100644
--- a/test/Transforms/AddDiscriminators/multiple.ll
+++ b/test/Transforms/AddDiscriminators/multiple.ll
@@ -67,6 +67,6 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !12 = !DILocation(line: 4, scope: !4)
 
 ; CHECK: ![[THEN]] = !DILocation(line: 3, scope: ![[THENBLOCK:[0-9]+]])
-; CHECK: ![[THENBLOCK]] = !DILexicalBlockFile(scope: ![[SCOPE:[0-9]+]],{{.*}} discriminator: 1)
+; CHECK: ![[THENBLOCK]] = !DILexicalBlockFile(scope: ![[SCOPE:[0-9]+]],{{.*}} discriminator: 2)
 ; CHECK: ![[ELSE]] = !DILocation(line: 3, scope: ![[ELSEBLOCK:[0-9]+]])
-; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile(scope: ![[SCOPE]],{{.*}} discriminator: 2)
+; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile(scope: ![[SCOPE]],{{.*}} discriminator: 4)
diff --git a/test/Transforms/AddDiscriminators/oneline.ll b/test/Transforms/AddDiscriminators/oneline.ll
index aa52ae42ee47..724574a24ddf 100644
--- a/test/Transforms/AddDiscriminators/oneline.ll
+++ b/test/Transforms/AddDiscriminators/oneline.ll
@@ -7,9 +7,9 @@
 ; #3 }
 
 ; i == 3: discriminator 0
-; i == 5: discriminator 1
-; return 100: discriminator 2
-; return 99: discriminator 3
+; i == 5: discriminator 2
+; return 100: discriminator 4
+; return 99: discriminator 6
 
 define i32 @_Z3fooi(i32 %i) #0 !dbg !4 {
   %1 = alloca i32, align 4
@@ -91,11 +91,11 @@ attributes #1 = { nounwind readnone }
 ; CHECK: ![[F:.*]] = distinct !DISubprogram(name: "foo",
 ; CHECK: ![[IF:.*]] = distinct !DILexicalBlock(scope: ![[F]],{{.*}}line: 2, column: 7)
 ; CHECK: ![[THEN1]] = !DILocation(line: 2, column: 17, scope: ![[THENBLOCK:[0-9]+]])
-; CHECK: ![[THENBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 1)
+; CHECK: ![[THENBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 2)
 ; CHECK: ![[THEN2]] = !DILocation(line: 2, column: 19, scope: ![[THENBLOCK]])
 ; CHECK: ![[THEN3]] = !DILocation(line: 2, column: 7, scope: ![[BRBLOCK:[0-9]+]])
-; CHECK: ![[BRBLOCK]] = !DILexicalBlockFile(scope: ![[F]],{{.*}} discriminator: 1)
+; CHECK: ![[BRBLOCK]] = !DILexicalBlockFile(scope: ![[F]],{{.*}} discriminator: 2)
 ; CHECK: ![[ELSE]] = !DILocation(line: 2, column: 25, scope: ![[ELSEBLOCK:[0-9]+]])
-; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 2)
+; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 4)
 ; CHECK: ![[COMBINE]] = !DILocation(line: 2, column: 42, scope: ![[COMBINEBLOCK:[0-9]+]])
-; CHECK: ![[COMBINEBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 3)
+; CHECK: ![[COMBINEBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 6)
diff --git a/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll b/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll
index 267a6c045974..fac84d092df3 100644
--- a/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll
+++ b/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll
@@ -1,25 +1,30 @@
-; RUN: opt < %s -argpromotion -S > %t
-; RUN: cat %t | grep "define.*@callee(.*i32\*"
+; RUN: opt < %s -argpromotion -S | FileCheck %s
 ; PR2498
 
 ; This test tries to convince argpromotion about promoting the load from %A + 2,
 ; because there is a load of %A in the entry block
 
 define internal i32 @callee(i1 %C, i32* %A) {
+; CHECK-LABEL: define internal i32 @callee(
+; CHECK: i1 %C, i32* %A)
 entry:
-  ; Unconditonally load the element at %A
-  %A.0 = load i32, i32* %A
-  br i1 %C, label %T, label %F
+  ; Unconditonally load the element at %A
+  %A.0 = load i32, i32* %A
+  br i1 %C, label %T, label %F
+
 T:
-  ret i32 %A.0
+  ret i32 %A.0
+
 F:
-  ; Load the element at offset two from %A. This should not be promoted!
-  %A.2 = getelementptr i32, i32* %A, i32 2
-  %R = load i32, i32* %A.2
-  ret i32 %R
+  ; Load the element at offset two from %A. This should not be promoted!
+  %A.2 = getelementptr i32, i32* %A, i32 2
+  %R = load i32, i32* %A.2
+  ret i32 %R
 }
 
 define i32 @foo() {
+; CHECK-LABEL: define i32 @foo
   %X = call i32 @callee(i1 false, i32* null)            ; <i32> [#uses=1]
+; CHECK: call i32 @callee(i1 false, i32* null)
   ret i32 %X
 }
diff --git a/test/Transforms/ArgumentPromotion/aggregate-promote.ll b/test/Transforms/ArgumentPromotion/aggregate-promote.ll
index 3f521bace7f3..b0bab7784edb 100644
--- a/test/Transforms/ArgumentPromotion/aggregate-promote.ll
+++ b/test/Transforms/ArgumentPromotion/aggregate-promote.ll
@@ -1,24 +1,31 @@
-; RUN: opt < %s -argpromotion -instcombine -S | not grep load
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
-%QuadTy = type { i32, i32, i32, i32 }
-@G = constant %QuadTy {
-    i32 0,
-    i32 0,
-    i32 17,
-    i32 25 }                ; <%QuadTy*> [#uses=1]
+%T = type { i32, i32, i32, i32 }
+@G = constant %T { i32 0, i32 0, i32 17, i32 25 }
 
-define internal i32 @test(%QuadTy* %P) {
-  %A = getelementptr %QuadTy, %QuadTy* %P, i64 0, i32 3 ; <i32*> [#uses=1]
-  %B = getelementptr %QuadTy, %QuadTy* %P, i64 0, i32 2 ; <i32*> [#uses=1]
-  %a = load i32, i32* %A  ; <i32> [#uses=1]
-  %b = load i32, i32* %B  ; <i32> [#uses=1]
-  %V = add i32 %a, %b     ; <i32> [#uses=1]
-  ret i32 %V
+define internal i32 @test(%T* %p) {
+; CHECK-LABEL: define internal i32 @test(
+; CHECK: i32 %{{.*}}, i32 %{{.*}})
+entry:
+  %a.gep = getelementptr %T, %T* %p, i64 0, i32 3
+  %b.gep = getelementptr %T, %T* %p, i64 0, i32 2
+  %a = load i32, i32* %a.gep
+  %b = load i32, i32* %b.gep
+; CHECK-NOT: load
+  %v = add i32 %a, %b
+  ret i32 %v
+; CHECK: ret i32
 }
 
 define i32 @caller() {
-  %V = call i32 @test( %QuadTy* @G )    ; <i32> [#uses=1]
-  ret i32 %V
+; CHECK-LABEL: define i32 @caller(
+entry:
+  %v = call i32 @test(%T* @G)
+; CHECK: %[[B_GEP:.*]] = getelementptr %T, %T* @G, i64 0, i32 2
+; CHECK: %[[B:.*]] = load i32, i32* %[[B_GEP]]
+; CHECK: %[[A_GEP:.*]] = getelementptr %T, %T* @G, i64 0, i32 3
+; CHECK: %[[A:.*]] = load i32, i32* %[[A_GEP]]
+; CHECK: call i32 @test(i32 %[[B]], i32 %[[A]])
+  ret i32 %v
 }
-
diff --git a/test/Transforms/ArgumentPromotion/attrs.ll b/test/Transforms/ArgumentPromotion/attrs.ll
index 46128f93c240..29cef50fe802 100644
--- a/test/Transforms/ArgumentPromotion/attrs.ll
+++ b/test/Transforms/ArgumentPromotion/attrs.ll
@@ -1,25 +1,52 @@
-; RUN: opt < %s -argpromotion -S | grep zeroext
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
-  %struct.ss = type { i32, i64 }
+%struct.ss = type { i32, i64 }
 
-define internal void @f(%struct.ss* byval %b, i32* byval %X, i32 %i) nounwind {
+; Don't drop 'byval' on %X here.
+define internal void @f(%struct.ss* byval %b, i32* byval %X, i32 %i) nounwind {
+; CHECK-LABEL: define internal void @f(
+; CHECK: i32 %[[B0:.*]], i64 %[[B1:.*]], i32* byval %X, i32 %i)
 entry:
-  %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
-  %tmp1 = load i32, i32* %tmp, align 4
-  %tmp2 = add i32 %tmp1, 1
-  store i32 %tmp2, i32* %tmp, align 4
+; CHECK: %[[B:.*]] = alloca %struct.ss
+; CHECK: %[[B_GEP0:.*]] = getelementptr %struct.ss, %struct.ss* %[[B]], i32 0, i32 0
+; CHECK: store i32 %[[B0]], i32* %[[B_GEP0]]
+; CHECK: %[[B_GEP1:.*]] = getelementptr %struct.ss, %struct.ss* %[[B]], i32 0, i32 1
+; CHECK: store i64 %[[B1]], i64* %[[B_GEP1]]
 
-  store i32 0, i32* %X
-  ret void
+  %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
+; CHECK: %[[TMP:.*]] = getelementptr %struct.ss, %struct.ss* %[[B]], i32 0, i32 0
+  %tmp1 = load i32, i32* %tmp, align 4
+; CHECK: %[[TMP1:.*]] = load i32, i32* %[[TMP]]
+  %tmp2 = add i32 %tmp1, 1
+; CHECK: %[[TMP2:.*]] = add i32 %[[TMP1]], 1
+  store i32 %tmp2, i32* %tmp, align 4
+; CHECK: store i32 %[[TMP2]], i32* %[[TMP]]
+
+  store i32 0, i32* %X
+; CHECK: store i32 0, i32* %X
+  ret void
 }
 
+; Also make sure we don't drop the call zeroext attribute.
 define i32 @test(i32* %X) {
+; CHECK-LABEL: define i32 @test(
 entry:
-  %S = alloca %struct.ss        ; <%struct.ss*> [#uses=4]
-  %tmp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0        ; <i32*> [#uses=1]
-  store i32 1, i32* %tmp1, align 8
-  %tmp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1        ; <i64*> [#uses=1]
-  store i64 2, i64* %tmp4, align 4
-  call void @f( %struct.ss* byval %S, i32* byval %X, i32 zeroext 0)
-  ret i32 0
+  %S = alloca %struct.ss
+; CHECK: %[[S:.*]] = alloca %struct.ss
+  %tmp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0
+  store i32 1, i32* %tmp1, align 8
+; CHECK: store i32 1
+  %tmp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1
+  store i64 2, i64* %tmp4, align 4
+; CHECK: store i64 2
+
+  call void @f( %struct.ss* byval %S, i32* byval %X, i32 zeroext 0)
+; CHECK: %[[S_GEP0:.*]] = getelementptr %struct.ss, %struct.ss* %[[S]], i32 0, i32 0
+; CHECK: %[[S0:.*]] = load i32, i32* %[[S_GEP0]]
+; CHECK: %[[S_GEP1:.*]] = getelementptr %struct.ss, %struct.ss* %[[S]], i32 0, i32 1
+; CHECK: %[[S1:.*]] = load i64, i64* %[[S_GEP1]]
+; CHECK: call void @f(i32 %[[S0]], i64 %[[S1]], i32* byval %X, i32 zeroext 0)
+
+  ret i32 0
 }
diff --git a/test/Transforms/ArgumentPromotion/byval-2.ll b/test/Transforms/ArgumentPromotion/byval-2.ll
index 6c0288f5f989..3e1fee8badd9 100644
--- a/test/Transforms/ArgumentPromotion/byval-2.ll
+++ b/test/Transforms/ArgumentPromotion/byval-2.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; Arg promotion eliminates the struct argument.
 ; FIXME: Should it eliminate the i32* argument?
diff --git a/test/Transforms/ArgumentPromotion/byval.ll b/test/Transforms/ArgumentPromotion/byval.ll
index b091b09a3597..58475fc89607 100644
--- a/test/Transforms/ArgumentPromotion/byval.ll
+++ b/test/Transforms/ArgumentPromotion/byval.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
diff --git a/test/Transforms/ArgumentPromotion/callgraph-update.ll b/test/Transforms/ArgumentPromotion/callgraph-update.ll
deleted file mode 100644
index 989043d7ea58..000000000000
--- a/test/Transforms/ArgumentPromotion/callgraph-update.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: opt < %s -argpromotion -simplifycfg -constmerge | llvm-dis
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin10.0"
-
-%struct.VEC2 = type { double, double, double }
-%struct.VERTEX = type { %struct.VEC2, %struct.VERTEX*, %struct.VERTEX* }
-%struct.edge_rec = type { %struct.VERTEX*, %struct.edge_rec*, i32, i8* }
-
-declare %struct.edge_rec* @alloc_edge() nounwind ssp
-
-define i64 @build_delaunay(%struct.VERTEX* %tree, %struct.VERTEX* %extra) nounwind ssp {
-entry:
-  br i1 undef, label %bb11, label %bb12
-
-bb11:                                             ; preds = %bb10
-  %a = call %struct.edge_rec* @alloc_edge() nounwind      ; <%struct.edge_rec*> [#uses=0]
-  ret i64 123
-
-bb12:                                             ; preds = %bb10
-  %b = call %struct.edge_rec* @alloc_edge() nounwind      ; <%struct.edge_rec*> [#uses=1]
-  %c = ptrtoint %struct.edge_rec* %b to i64
-  ret i64 %c
-}
diff --git a/test/Transforms/ArgumentPromotion/chained.ll b/test/Transforms/ArgumentPromotion/chained.ll
index 6ba2e8d48694..028c6c426e52 100644
--- a/test/Transforms/ArgumentPromotion/chained.ll
+++ b/test/Transforms/ArgumentPromotion/chained.ll
@@ -1,17 +1,27 @@
-; RUN: opt < %s -argpromotion -instcombine -S | not grep load
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
-@G1 = constant i32 0            ; <i32*> [#uses=1]
-@G2 = constant i32* @G1         ; <i32**> [#uses=1]
+@G1 = constant i32 0
+@G2 = constant i32* @G1
 
-define internal i32 @test(i32** %X) {
-  %Y = load i32*, i32** %X      ; <i32*> [#uses=1]
-  %X.upgrd.1 = load i32, i32* %Y        ; <i32> [#uses=1]
-  ret i32 %X.upgrd.1
+define internal i32 @test(i32** %x) {
+; CHECK-LABEL: define internal i32 @test(
+; CHECK: i32 %{{.*}})
+entry:
+  %y = load i32*, i32** %x
+  %z = load i32, i32* %y
+; CHECK-NOT: load
+  ret i32 %z
+; CHECK: ret i32
 }
 
-define i32 @caller(i32** %P) {
-  %X = call i32 @test( i32** @G2 )      ; <i32> [#uses=1]
-  ret i32 %X
+define i32 @caller() {
+; CHECK-LABEL: define i32 @caller()
+entry:
+  %x = call i32 @test(i32** @G2)
+; CHECK: %[[Y:.*]] = load i32*, i32** @G2
+; CHECK: %[[Z:.*]] = load i32, i32* %[[Y]]
+; CHECK: call i32 @test(i32 %[[Z]])
+  ret i32 %x
 }
diff --git a/test/Transforms/ArgumentPromotion/control-flow.ll b/test/Transforms/ArgumentPromotion/control-flow.ll
index cdff36eb83c0..c3fe0c00e877 100644
--- a/test/Transforms/ArgumentPromotion/control-flow.ll
+++ b/test/Transforms/ArgumentPromotion/control-flow.ll
@@ -1,19 +1,27 @@
-; RUN: opt < %s -argpromotion -S | \
-; RUN:    not grep "load i32* null"
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
+; Don't promote around control flow.
 define internal i32 @callee(i1 %C, i32* %P) {
-  br i1 %C, label %T, label %F
+; CHECK-LABEL: define internal i32 @callee(
+; CHECK: i1 %C, i32* %P)
+entry:
+  br i1 %C, label %T, label %F
 
-T:              ; preds = %0
-  ret i32 17
+T:
+  ret i32 17
 
-F:              ; preds = %0
-  %X = load i32, i32* %P          ; <i32> [#uses=1]
-  ret i32 %X
+F:
+  %X = load i32, i32* %P
+  ret i32 %X
 }
 
 define i32 @foo() {
-  %X = call i32 @callee( i1 true, i32* null )             ; <i32> [#uses=1]
-  ret i32 %X
+; CHECK-LABEL: define i32 @foo(
+entry:
+; CHECK-NOT: load i32, i32* null
+  %X = call i32 @callee(i1 true, i32* null)
+; CHECK: call i32 @callee(i1 true, i32* null)
+  ret i32 %X
 }
diff --git a/test/Transforms/ArgumentPromotion/control-flow2.ll b/test/Transforms/ArgumentPromotion/control-flow2.ll
index 7413f46a860f..b75a32ddb331 100644
--- a/test/Transforms/ArgumentPromotion/control-flow2.ll
+++ b/test/Transforms/ArgumentPromotion/control-flow2.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; CHECK: load i32, i32* %A
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
diff --git a/test/Transforms/ArgumentPromotion/crash.ll b/test/Transforms/ArgumentPromotion/crash.ll
index d3f412da14d9..d46a48101e78 100644
--- a/test/Transforms/ArgumentPromotion/crash.ll
+++ b/test/Transforms/ArgumentPromotion/crash.ll
@@ -1,61 +1,73 @@
-; RUN: opt -inline -argpromotion < %s
-; rdar://7879828
+; RUN: opt -S < %s -inline -argpromotion | FileCheck %s
+; RUN: opt -S < %s -passes=inline,argpromotion | FileCheck %s
 
-define void @foo() personality i32 (...)* @__gxx_personality_v0 {
-  invoke void @foo2()
-    to label %if.end432 unwind label %for.end520
+%S = type { %S* }
 
-if.end432:
+; Inlining should nuke the invoke (and any inlined calls) here even with
+; argument promotion running along with it.
+define void @zot() personality i32 (...)* @wibble {
+; CHECK-LABEL: define void @zot() personality i32 (...)* @wibble
+; CHECK-NOT: call
+; CHECK-NOT: invoke
+bb:
+  invoke void @hoge()
+    to label %bb1 unwind label %bb2
+
+bb1:
   unreachable
 
-for.end520:
-  %exn = landingpad {i8*, i32}
-    cleanup
+bb2:
+  %tmp = landingpad { i8*, i32 }
+    cleanup
   unreachable
 }
 
-define internal void @foo2() ssp {
-  %call7 = call fastcc i8* @foo3(i1 (i8*)* @foo4)
-  %call58 = call fastcc i8* @foo3(i1 (i8*)* @foo5)
+define internal void @hoge() {
+bb:
+  %tmp = call fastcc i8* @spam(i1 (i8*)* @eggs)
+  %tmp1 = call fastcc i8* @spam(i1 (i8*)* @barney)
   unreachable
 }
 
-define internal fastcc i8* @foo3(i1 (i8*)* %Pred) {
-entry:
+define internal fastcc i8* @spam(i1 (i8*)* %arg) {
+bb:
   unreachable
 }
 
-define internal i1 @foo4(i8* %O) nounwind {
-entry:
-  %call = call zeroext i1 @foo5(i8* %O)           ; <i1> [#uses=0]
+define internal i1 @eggs(i8* %arg) {
+bb:
+  %tmp = call zeroext i1 @barney(i8* %arg)
  unreachable
 }
 
-define internal i1 @foo5(i8* %O) nounwind {
-entry:
+define internal i1 @barney(i8* %arg) {
+bb:
  ret i1 undef
 }
 
+define i32 @test_inf_promote_caller(i32 %arg) {
+; CHECK-LABEL: define i32 @test_inf_promote_caller(
+bb:
+  %tmp = alloca %S
+  %tmp1 = alloca %S
+  %tmp2 = call i32 @test_inf_promote_callee(%S* %tmp, %S* %tmp1)
+; CHECK: call i32 @test_inf_promote_callee(%S* %{{.*}}, %S* %{{.*}})
 
-; PR8932 - infinite promotion.
-%0 = type { %0* }
-
-define i32 @test2(i32 %a) {
-init:
-  %0 = alloca %0
-  %1 = alloca %0
-  %2 = call i32 @"clay_assign(Chain, Chain)"(%0* %0, %0* %1)
   ret i32 0
 }
 
-define internal i32 @"clay_assign(Chain, Chain)"(%0* %c, %0* %d) {
-init:
-  %0 = getelementptr %0, %0* %d, i32 0, i32 0
-  %1 = load %0*, %0** %0
-  %2 = getelementptr %0, %0* %c, i32 0, i32 0
-  %3 = load %0*, %0** %2
-  %4 = call i32 @"clay_assign(Chain, Chain)"(%0* %3, %0* %1)
+define internal i32 @test_inf_promote_callee(%S* %arg, %S* %arg1) {
+; CHECK-LABEL: define internal i32 @test_inf_promote_callee(
+; CHECK: %S* %{{.*}}, %S* %{{.*}})
+bb:
+  %tmp = getelementptr %S, %S* %arg1, i32 0, i32 0
+  %tmp2 = load %S*, %S** %tmp
+  %tmp3 = getelementptr %S, %S* %arg, i32 0, i32 0
+  %tmp4 = load %S*, %S** %tmp3
+  %tmp5 = call i32 @test_inf_promote_callee(%S* %tmp4, %S* %tmp2)
+; CHECK: call i32 @test_inf_promote_callee(%S* %{{.*}}, %S* %{{.*}})
+
   ret i32 0
 }
 
-declare i32 @__gxx_personality_v0(...)
+declare i32 @wibble(...)
diff --git a/test/Transforms/ArgumentPromotion/dbg.ll b/test/Transforms/ArgumentPromotion/dbg.ll
index 3d353db105fd..61b7c1843e48 100644
--- a/test/Transforms/ArgumentPromotion/dbg.ll
+++ b/test/Transforms/ArgumentPromotion/dbg.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 declare void @sink(i32)
 
@@ -23,6 +24,6 @@ define void @caller(i32** %Y) {
 !0 = !{i32 2, !"Debug Info Version", i32 3}
 !1 = !DILocation(line: 8, scope: !2)
-!2 = distinct !DISubprogram(name: "test", line: 3, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, scopeLine: 3, scope: null)
+!2 = distinct !DISubprogram(name: "test", file: !5, line: 3, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, scopeLine: 3, scope: null)
 !3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: LineTablesOnly, file: !5)
 !5 = !DIFile(filename: "test.c", directory: "")
diff --git a/test/Transforms/ArgumentPromotion/fp80.ll b/test/Transforms/ArgumentPromotion/fp80.ll
index 84ef603de82c..bd780fa21aeb 100644
--- a/test/Transforms/ArgumentPromotion/fp80.ll
+++ b/test/Transforms/ArgumentPromotion/fp80.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/ArgumentPromotion/inalloca.ll b/test/Transforms/ArgumentPromotion/inalloca.ll
index 5bf57c8ff465..7ea3b4e42777 100644
--- a/test/Transforms/ArgumentPromotion/inalloca.ll
+++ b/test/Transforms/ArgumentPromotion/inalloca.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -argpromotion -sroa -S | FileCheck %s
+; RUN: opt %s -passes='argpromotion,function(sroa)' -S | FileCheck %s
 
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
diff --git a/test/Transforms/ArgumentPromotion/pr27568.ll b/test/Transforms/ArgumentPromotion/pr27568.ll
index 648317aee0da..1496780748da 100644
--- a/test/Transforms/ArgumentPromotion/pr27568.ll
+++ b/test/Transforms/ArgumentPromotion/pr27568.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -argpromotion < %s | FileCheck %s
+; RUN: opt -S -passes=argpromotion < %s | FileCheck %s
 target triple = "x86_64-pc-windows-msvc"
 
 define internal void @callee(i8*) {
diff --git a/test/Transforms/ArgumentPromotion/profile.ll b/test/Transforms/ArgumentPromotion/profile.ll
new file mode 100644
index 000000000000..f667f9ea2c2a
--- /dev/null
+++ b/test/Transforms/ArgumentPromotion/profile.ll
@@ -0,0 +1,23 @@
+; RUN: opt -argpromotion -mem2reg -S < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+; Checks if !prof metadata is corret in deadargelim.
+
+define void @caller() #0 {
+  %x = alloca i32
+  store i32 42, i32* %x
+  call void @promote_i32_ptr(i32* %x), !prof !0
+; CHECK: call void @promote_i32_ptr(i32 42), !prof ![[PROF:[0-9]]]
+  ret void
+}
+
+define internal void @promote_i32_ptr(i32* %xp) {
+  %x = load i32, i32* %xp
+  call void @use_i32(i32 %x)
+  ret void
+}
+
+declare void @use_i32(i32)
+
+; CHECK: ![[PROF]] = !{!"branch_weights", i32 30}
+!0 = !{!"branch_weights", i32 30}
diff --git a/test/Transforms/ArgumentPromotion/reserve-tbaa.ll b/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
index 3c8ed79eeb29..3a3aa44b2a98 100644
--- a/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
+++ b/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -argpromotion -S
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; PR17906
 ; When we promote two arguments in a single function with different types,
diff --git a/test/Transforms/ArgumentPromotion/sret.ll b/test/Transforms/ArgumentPromotion/sret.ll
index 8e5521f48d10..55fc036f1775 100644
--- a/test/Transforms/ArgumentPromotion/sret.ll
+++ b/test/Transforms/ArgumentPromotion/sret.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-windows-msvc"
diff --git a/test/Transforms/ArgumentPromotion/tail.ll b/test/Transforms/ArgumentPromotion/tail.ll
index 2ea387cd2645..93de60afe915 100644
--- a/test/Transforms/ArgumentPromotion/tail.ll
+++ b/test/Transforms/ArgumentPromotion/tail.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -argpromotion -S -o - | FileCheck %s
+; RUN: opt %s -passes=argpromotion -S -o - | FileCheck %s
 ; PR14710
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/Transforms/ArgumentPromotion/variadic.ll b/test/Transforms/ArgumentPromotion/variadic.ll
index 0e03882d3b20..034f853883fd 100644
--- a/test/Transforms/ArgumentPromotion/variadic.ll
+++ b/test/Transforms/ArgumentPromotion/variadic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; Unused arguments from variadic functions cannot be eliminated as that changes
 ; their classiciation according to the SysV amd64 ABI. Clang and other frontends
diff --git a/test/Transforms/AtomicExpand/SPARC/libcalls.ll b/test/Transforms/AtomicExpand/SPARC/libcalls.ll
index afab7a39b278..fc6aade8708a 100644
--- a/test/Transforms/AtomicExpand/SPARC/libcalls.ll
+++ b/test/Transforms/AtomicExpand/SPARC/libcalls.ll
@@ -43,11 +43,11 @@ define i16 @test_exchange_i16(i16* %arg, i16 %val) {
 ; CHECK: %1 = bitcast i16* %arg to i8*
 ; CHECK: %2 = alloca i16, align 2
 ; CHECK: %3 = bitcast i16* %2 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 2, i8* %3)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 2, i8* %3)
 ; CHECK: store i16 %old, i16* %2, align 2
 ; CHECK: %4 = call zeroext i1 @__atomic_compare_exchange_2(i8* %1, i8* %3, i16 %new, i32 5, i32 0)
 ; CHECK: %5 = load i16, i16* %2, align 2
-; CHECK: call void @llvm.lifetime.end(i64 2, i8* %3)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 2, i8* %3)
 ; CHECK: %6 = insertvalue { i16, i1 } undef, i16 %5, 0
 ; CHECK: %7 = insertvalue { i16, i1 } %6, i1 %4, 1
 ; CHECK: %ret = extractvalue { i16, i1 } %7, 0
@@ -76,10 +76,10 @@ define i16 @test_add_i16(i16* %arg, i16 %val) {
 ; CHECK: %1 = bitcast i128* %arg to i8*
 ; CHECK: %2 = alloca i128, align 8
 ; CHECK: %3 = bitcast i128* %2 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK: call void @__atomic_load(i32 16, i8* %1, i8* %3, i32 5)
 ; CHECK: %4 = load i128, i128* %2, align 8
-; CHECK: call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK: ret i128 %4
 define i128 @test_load_i128(i128* %arg) {
   %ret = load atomic i128, i128* %arg seq_cst, align 16
@@ -90,10 +90,10 @@ define i128 @test_load_i128(i128* %arg) {
 ; CHECK: %1 = bitcast i128* %arg to i8*
 ; CHECK: %2 = alloca i128, align 8
 ; CHECK: %3 = bitcast i128* %2 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK: store i128 %val, i128* %2, align 8
 ; CHECK: call void @__atomic_store(i32 16, i8* %1, i8* %3, i32 5)
-; CHECK: call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK: ret void
 define void @test_store_i128(i128* %arg, i128 %val) {
   store atomic i128 %val, i128* %arg seq_cst, align 16
@@ -104,15 +104,15 @@ define void @test_store_i128(i128* %arg, i128 %val) {
 ; CHECK: %1 = bitcast i128* %arg to i8*
 ; CHECK: %2 = alloca i128, align 8
 ; CHECK: %3 = bitcast i128* %2 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK: store i128 %val, i128* %2, align 8
 ; CHECK: %4 = alloca i128, align 8
 ; CHECK: %5 = bitcast i128* %4 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 16, i8* %5)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* %5)
 ; CHECK: call void @__atomic_exchange(i32 16, i8* %1, i8* %3, i8* %5, i32 5)
-; CHECK: call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK: %6 = load i128, i128* %4, align 8
-; CHECK: call void @llvm.lifetime.end(i64 16, i8* %5)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 16, i8* %5)
 ; CHECK: ret i128 %6
 define i128 @test_exchange_i128(i128* %arg, i128 %val) {
   %ret = atomicrmw xchg i128* %arg, i128 %val seq_cst
@@ -123,16 +123,16 @@ define i128 @test_exchange_i128(i128* %arg, i128 %val) {
 ; CHECK: %1 = bitcast i128* %arg to i8*
 ; CHECK: %2 = alloca i128, align 8
 ; CHECK: %3 = bitcast i128* %2 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK: store i128 %old, i128* %2, align 8
 ; CHECK: %4 = alloca i128, align 8
 ; CHECK: %5 = bitcast i128* %4 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 16, i8* %5)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* %5)
 ; CHECK: store i128 %new, i128* %4, align 8
 ; CHECK: %6 = call zeroext i1 @__atomic_compare_exchange(i32 16, i8* %1, i8* %3, i8* %5, i32 5, i32 0)
-; CHECK: call void @llvm.lifetime.end(i64 16, i8* %5)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 16, i8* %5)
 ; CHECK: %7 = load i128, i128* %2, align 8
-; CHECK: call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK: %8 = insertvalue { i128, i1 } undef, i128 %7, 0
 ; CHECK: %9 = insertvalue { i128, i1 } %8, i1 %6, 1
 ; CHECK: %ret = extractvalue { i128, i1 } %9, 0
@@ -157,15 +157,15 @@ define i128 @test_cmpxchg_i128(i128* %arg, i128 %old, i128 %new) {
 ; CHECK: %new = add i128 %loaded, %val
 ; CHECK: %4 = bitcast i128* %arg to i8*
 ; CHECK: %5 = bitcast i128* %1 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 16, i8* %5)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* %5)
 ; CHECK: store i128 %loaded, i128* %1, align 8
 ; CHECK: %6 = bitcast i128* %2 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 16, i8* %6)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* %6)
 ; CHECK: store i128 %new, i128* %2, align 8
 ; CHECK: %7 = call zeroext i1 @__atomic_compare_exchange(i32 16, i8* %4, i8* %5, i8* %6, i32 5, i32 5)
-; CHECK: call void @llvm.lifetime.end(i64 16, i8* %6)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 16, i8* %6)
 ; CHECK: %8 = load i128, i128* %1, align 8
-; CHECK: call void @llvm.lifetime.end(i64 16, i8* %5)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 16, i8* %5)
 ; CHECK: %9 = insertvalue { i128, i1 } undef, i128 %8, 0
 ; CHECK: %10 = insertvalue { i128, i1 } %9, i1 %7, 1
 ; CHECK: %success = extractvalue { i128, i1 } %10, 1
@@ -204,12 +204,12 @@ define void @test_store_double(double* %arg, double %val) {
 ; CHECK: %1 = bitcast i16** %arg to i8*
 ; CHECK: %2 = alloca i16*, align 4
 ; CHECK: %3 = bitcast i16** %2 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 4, i8* %3)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 4, i8* %3)
 ; CHECK: store i16* %old, i16** %2, align 4
 ; CHECK: %4 = ptrtoint i16* %new to i32
 ; CHECK: %5 = call zeroext i1 @__atomic_compare_exchange_4(i8* %1, i8* %3, i32 %4, i32 5, i32 2)
 ; CHECK: %6 = load i16*, i16** %2, align 4
-; CHECK: call void @llvm.lifetime.end(i64 4, i8* %3)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 4, i8* %3)
 ; CHECK: %7 = insertvalue { i16*, i1 } undef, i16* %6, 0
 ; CHECK: %8 = insertvalue { i16*, i1 } %7, i1 %5, 1
 ; CHECK: %ret = extractvalue { i16*, i1 } %8, 0
@@ -227,10 +227,10 @@ define i16* @test_cmpxchg_ptr(i16** %arg, i16* %old, i16* %new) {
 ; CHECK: %1 = bitcast fp128* %arg to i8*
 ; CHECK: %2 = alloca fp128, align 8
 ; CHECK: %3 = bitcast fp128* %2 to i8*
-; CHECK: call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK: store fp128 %val, fp128* %2, align 8
 ; CHECK: call void @__atomic_store(i32 16, i8* %1, i8* %3, i32 5)
-; CHECK: call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK: ret void
 define void @test_store_fp128(fp128* %arg, fp128 %val) {
   store atomic fp128 %val, fp128* %arg seq_cst, align 16
diff --git a/test/Transforms/BBVectorize/X86/loop1.ll b/test/Transforms/BBVectorize/X86/loop1.ll
index 551fbd73eb27..a533713609a7 100644
--- a/test/Transforms/BBVectorize/X86/loop1.ll
+++ b/test/Transforms/BBVectorize/X86/loop1.ll
@@ -1,7 +1,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -basicaa -loop-unroll -unroll-partial-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
 ; The second check covers the use of alias analysis (with loop unrolling).
 
 define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
diff --git a/test/Transforms/BBVectorize/X86/wr-aliases.ll b/test/Transforms/BBVectorize/X86/wr-aliases.ll
index a6ea27fc3ecb..e34414988f32 100644
--- a/test/Transforms/BBVectorize/X86/wr-aliases.ll
+++ b/test/Transforms/BBVectorize/X86/wr-aliases.ll
@@ -14,7 +14,7 @@ declare fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval nocaptur
 declare void @llvm.lifetime.start(i64, i8* nocapture) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 define void @main_arrayctor.cont([10 x %class.QBezier.15]* %beziers, %class.QBezier.15* %agg.tmp.i, %class.QBezier.15* %agg.tmp55.i, %class.QBezier.15* %agg.tmp56.i) {
 newFuncRoot:
@@ -134,9 +134,9 @@ arrayctor.cont:                                   ; preds = %newFuncRoot
   call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp55.i)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v2, i8* %v3, i64 64, i32 8, i1 false)
   call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp56.i)
-  call void @llvm.lifetime.end(i64 64, i8* %v0)
-  call void @llvm.lifetime.end(i64 64, i8* %v1)
-  call void @llvm.lifetime.end(i64 64, i8* %v2)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %v0)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %v1)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %v2)
   br label %arrayctor.cont.ret.exitStub
 }
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll
index 7e7b603116fc..8ff5953cf46a 100644
--- a/test/Transforms/BBVectorize/loop1.ll
+++ b/test/Transforms/BBVectorize/loop1.ll
@@ -1,7 +1,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -dont-improve-non-negative-phi-bits=false -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
+; RUN: opt < %s -dont-improve-non-negative-phi-bits=false -basicaa -loop-unroll -unroll-threshold=45 -unroll-partial-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
 ; The second check covers the use of alias analysis (with loop unrolling).
 
 define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
diff --git a/test/Transforms/BDCE/basic.ll b/test/Transforms/BDCE/basic.ll
index 6e748c69a16e..6132c5d797fc 100644
--- a/test/Transforms/BDCE/basic.ll
+++ b/test/Transforms/BDCE/basic.ll
@@ -136,6 +136,44 @@ entry:
 declare i32 @llvm.bswap.i32(i32) #0
 
 ; Function Attrs: nounwind readnone
+define signext i32 @tim(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 536870912
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 1073741824
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %bs = tail call i32 @llvm.bitreverse.i32(i32 %or15) #0
+  %shr = ashr i32 %bs, 4
+  ret i32 %shr
+
+; CHECK-LABEL: @tim
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.bitreverse.i32(i32) #0
+
+; Function Attrs: nounwind readnone
 define signext i32 @tar2(i32 signext %x) #0 {
 entry:
   %call = tail call signext i32 @foo(i32 signext 5) #0
diff --git a/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll b/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
index 6cec253bbf9b..2bcb3a9d1e3d 100644
--- a/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
+++ b/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
@@ -5,7 +5,7 @@
 ; ASC-NOT: ptrtoint
 ; ASC-NOT: inttoptr
 
-define void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 {
 bb:
   %tmp = getelementptr inbounds float, float addrspace(3)* %arg2, i32 16
   %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll b/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
new file mode 100644
index 000000000000..dfa81b54cc3d
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; No bypassing should be done in apparently unsuitable cases.
+define void @Test_no_bypassing(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_no_bypassing(
+; CHECK-NEXT: [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT: [[A_2:%.*]] = sub i64 -1, [[A_1]]
+; CHECK-NEXT: [[RES:%.*]] = srem i64 [[A_2]], [[B:%.*]]
+; CHECK-NEXT: store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+  %a.1 = zext i32 %a to i64
+  ; %a.2 is always negative so the division cannot be bypassed.
+  %a.2 = sub i64 -1, %a.1
+  %res = srem i64 %a.2, %b
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; No OR instruction is needed if one of the operands (divisor) is known
+; to fit into 32 bits.
+define void @Test_check_one_operand(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_one_operand(
+; CHECK-NEXT: [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[A:%.*]], -4294967296
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP8:%.*]]
+; CHECK: [[TMP4:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: br label [[TMP10:%.*]]
+; CHECK: [[TMP9:%.*]] = sdiv i64 [[A]], [[B_1]]
+; CHECK-NEXT: br label [[TMP10]]
+; CHECK: [[TMP11:%.*]] = phi i64 [ [[TMP7]], [[TMP3]] ], [ [[TMP9]], [[TMP8]] ]
+; CHECK-NEXT: store i64 [[TMP11]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+  %b.1 = zext i32 %b to i64
+  %res = sdiv i64 %a, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; If both operands are known to fit into 32 bits, then replace the division
+; in-place without CFG modification.
+define void @Test_check_none(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_none(
+; CHECK-NEXT: [[A_1:%.*]] = and i64 [[A:%.*]], 4294967295
+; CHECK-NEXT: [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: store i64 [[TMP4]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+  %a.1 = and i64 %a, 4294967295
+  %b.1 = zext i32 %b to i64
+  %res = udiv i64 %a.1, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; In case of unsigned long division with a short dividend,
+; the long division is not needed any more.
+define void @Test_special_case(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_special_case(
+; CHECK-NEXT: [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = icmp uge i64 [[A_1]], [[B:%.*]]
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP9:%.*]]
+; CHECK: [[TMP3:%.*]] = trunc i64 [[B]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = urem i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: br label [[TMP9]]
+; CHECK: [[TMP10:%.*]] = phi i64 [ [[TMP7]], [[TMP2]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP8]], [[TMP2]] ], [ [[A_1]], [[TMP0]] ]
+; CHECK-NEXT: [[RES:%.*]] = add i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+  %a.1 = zext i32 %a to i64
+  %div = udiv i64 %a.1, %b
+  %rem = urem i64 %a.1, %b
+  %res = add i64 %div, %rem
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+
+; Do not bypass a division if one of the operands looks like a hash value.
+define void @Test_dont_bypass_xor(i64 %a, i64 %b, i64 %l, i64* %retptr) {
+; CHECK-LABEL: @Test_dont_bypass_xor(
+; CHECK-NEXT: [[C:%.*]] = xor i64 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[RES:%.*]] = udiv i64 [[C]], [[L:%.*]]
+; CHECK-NEXT: store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+  %c = xor i64 %a, %b
+  %res = udiv i64 %c, %l
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+define void @Test_dont_bypass_phi_xor(i64 %a, i64 %b, i64 %l, i64* %retptr) {
+; CHECK-LABEL: @Test_dont_bypass_phi_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[B:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[MERGE:%.*]], label [[XORPATH:%.*]]
+; CHECK: xorpath:
+; CHECK-NEXT: [[C:%.*]] = xor i64 [[A:%.*]], [[B]]
+; CHECK-NEXT: br label [[MERGE]]
+; CHECK: merge:
+; CHECK-NEXT: [[E:%.*]] = phi i64 [ undef, [[ENTRY:%.*]] ], [ [[C]], [[XORPATH]] ]
+; CHECK-NEXT: [[RES:%.*]] = sdiv i64 [[E]], [[L:%.*]]
+; CHECK-NEXT: store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+entry:
+  %cmp = icmp eq i64 %b, 0
+  br i1 %cmp, label %merge, label %xorpath
+
+xorpath:
+  %c = xor i64 %a, %b
+  br label %merge
+
+merge:
+  %e = phi i64 [ undef, %entry ], [ %c, %xorpath ]
+  %res = sdiv i64 %e, %l
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+define void @Test_dont_bypass_mul_long_const(i64 %a, i64 %l, i64* %retptr) {
+; CHECK-LABEL: @Test_dont_bypass_mul_long_const(
+; CHECK-NEXT: [[C:%.*]] = mul i64 [[A:%.*]], 5229553307
+; CHECK-NEXT: [[RES:%.*]] = urem i64 [[C]], [[L:%.*]]
+; CHECK-NEXT: store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+  %c = mul i64 %a, 5229553307 ; the constant doesn't fit 32 bits
+  %res = urem i64 %c, %l
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+define void @Test_bypass_phi_mul_const(i64 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_bypass_phi_mul_const(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_MUL:%.*]] = mul nsw i64 [[A:%.*]], 34806414968801
+; CHECK-NEXT: [[P:%.*]] = icmp sgt i64 [[A]], [[B:%.*]]
+; CHECK-NEXT: br i1 [[P]], label [[BRANCH:%.*]], label [[MERGE:%.*]]
+; CHECK: branch:
+; CHECK-NEXT: br label [[MERGE]]
+; CHECK: merge:
+; CHECK-NEXT: [[LHS:%.*]] = phi i64 [ 42, [[BRANCH]] ], [ [[A_MUL]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[LHS]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], -4294967296
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP8:%.*]]
+; CHECK: [[TMP4:%.*]] = trunc i64 [[B]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[LHS]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: br label [[TMP10:%.*]]
+; CHECK: [[TMP9:%.*]] = sdiv i64 [[LHS]], [[B]]
+; CHECK-NEXT: br label [[TMP10]]
+; CHECK: [[TMP11:%.*]] = phi i64 [ [[TMP7]], [[TMP3]] ], [ [[TMP9]], [[TMP8]] ]
+; CHECK-NEXT: store i64 [[TMP11]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+entry:
+  %a.mul = mul nsw i64 %a, 34806414968801
+  %p = icmp sgt i64 %a, %b
+  br i1 %p, label %branch, label %merge
+
+branch:
+  br label %merge
+
+merge:
+  %lhs = phi i64 [ 42, %branch ], [ %a.mul, %entry ]
+  %res = sdiv i64 %lhs, %b
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+define void @Test_bypass_mul_short_const(i64 %a, i64 %l, i64* %retptr) {
+; CHECK-LABEL: @Test_bypass_mul_short_const(
+; CHECK-NEXT: [[C:%.*]] = mul i64 [[A:%.*]], -42
+; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[C]], [[L:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP9:%.*]]
+; CHECK: [[TMP5:%.*]] = trunc i64 [[L]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[C]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = urem i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: br label [[TMP11:%.*]]
+; CHECK: [[TMP10:%.*]] = urem i64 [[C]], [[L]]
+; CHECK-NEXT: br label [[TMP11]]
+; CHECK: [[TMP12:%.*]] = phi i64 [ [[TMP8]], [[TMP4]] ], [ [[TMP10]], [[TMP9]] ]
+; CHECK-NEXT: store i64 [[TMP12]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+  %c = mul i64 %a, -42
+  %res = urem i64 %c, %l
+  store i64 %res, i64* %retptr
+  ret void
+}
diff --git a/test/Transforms/CodeGenPrepare/X86/computedgoto.ll b/test/Transforms/CodeGenPrepare/X86/computedgoto.ll
new file mode 100644
index 000000000000..00a4df9b2c59
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/X86/computedgoto.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -codegenprepare -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @use(i32) local_unnamed_addr
+declare void @useptr([2 x i8*]*) local_unnamed_addr
+
+; CHECK: @simple.targets = constant [2 x i8*] [i8* blockaddress(@simple, %bb0), i8* blockaddress(@simple, %bb1)], align 16
+@simple.targets = constant [2 x i8*] [i8* blockaddress(@simple, %bb0), i8* blockaddress(@simple, %bb1)], align 16
+
+; CHECK: @multi.targets = constant [2 x i8*] [i8* blockaddress(@multi, %bb0), i8* blockaddress(@multi, %bb1)], align 16
+@multi.targets = constant [2 x i8*] [i8* blockaddress(@multi, %bb0), i8* blockaddress(@multi, %bb1)], align 16
+
+; CHECK: @loop.targets = constant [2 x i8*] [i8* blockaddress(@loop, %bb0), i8* blockaddress(@loop, %bb1)], align 16
+@loop.targets = constant [2 x i8*] [i8* blockaddress(@loop, %bb0), i8* blockaddress(@loop, %bb1)], align 16
+
+; CHECK: @nophi.targets = constant [2 x i8*] [i8* blockaddress(@nophi, %bb0), i8* blockaddress(@nophi, %bb1)], align 16
+@nophi.targets = constant [2 x i8*] [i8* blockaddress(@nophi, %bb0), i8* blockaddress(@nophi, %bb1)], align 16
+
+; CHECK: @noncritical.targets = constant [2 x i8*] [i8* blockaddress(@noncritical, %bb0), i8* blockaddress(@noncritical, %bb1)], align 16
+@noncritical.targets = constant [2 x i8*] [i8* blockaddress(@noncritical, %bb0), i8* blockaddress(@noncritical, %bb1)], align 16
+
+; Check that we break the critical edge when a jump table has only one use.
+define void @simple(i32* nocapture readonly %p) {
+; CHECK-LABEL: @simple(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[INITVAL:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT: [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT: i32 0, label [[BB0_CLONE:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1_CLONE:%.*]]
+; CHECK-NEXT: ]
+; CHECK: bb0:
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: [[MERGE:%.*]] = phi i32* [ [[PTR:%.*]], [[BB0:%.*]] ], [ [[INCDEC_PTR]], [[BB0_CLONE]] ]
+; CHECK-NEXT: [[MERGE2:%.*]] = phi i32 [ 0, [[BB0]] ], [ [[INITVAL]], [[BB0_CLONE]] ]
+; CHECK-NEXT: tail call void @use(i32 [[MERGE2]])
+; CHECK-NEXT: br label [[INDIRECTGOTO:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[DOTSPLIT3:%.*]]
+; CHECK: .split3:
+; CHECK-NEXT: [[MERGE5:%.*]] = phi i32* [ [[PTR]], [[BB1:%.*]] ], [ [[INCDEC_PTR]], [[BB1_CLONE]] ]
+; CHECK-NEXT: [[MERGE7:%.*]] = phi i32 [ 1, [[BB1]] ], [ [[INITVAL]], [[BB1_CLONE]] ]
+; CHECK-NEXT: tail call void @use(i32 [[MERGE7]])
+; CHECK-NEXT: br label [[INDIRECTGOTO]]
+; CHECK: indirectgoto:
+; CHECK-NEXT: [[P_ADDR_SINK:%.*]] = phi i32* [ [[MERGE5]], [[DOTSPLIT3]] ], [ [[MERGE]], [[DOTSPLIT]] ]
+; CHECK-NEXT: [[PTR]] = getelementptr inbounds i32, i32* [[P_ADDR_SINK]], i64 1
+; CHECK-NEXT: [[NEWP:%.*]] = load i32, i32* [[P_ADDR_SINK]], align 4
+; CHECK-NEXT: [[IDX:%.*]] = sext i32 [[NEWP]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @simple.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT: [[NEWOP:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT: indirectbr i8* [[NEWOP]], [label [[BB0]], label %bb1]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+; CHECK: bb0.clone:
+; CHECK-NEXT: br label [[DOTSPLIT]]
+; CHECK: bb1.clone:
+; CHECK-NEXT: br label [[DOTSPLIT3]]
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %initval = load i32, i32* %p, align 4
+  %initop = load i32, i32* %incdec.ptr, align 4
+  switch i32 %initop, label %exit [
+    i32 0, label %bb0
+    i32 1, label %bb1
+  ]
+
+bb0:
+  %p.addr.0 = phi i32* [ %incdec.ptr, %entry ], [ %ptr, %indirectgoto ]
+  %opcode.0 = phi i32 [ %initval, %entry ], [ 0, %indirectgoto ]
+  tail call void @use(i32 %opcode.0)
+  br label %indirectgoto
+
+bb1:
+  %p.addr.1 = phi i32* [ %incdec.ptr, %entry ], [ %ptr, %indirectgoto ]
+  %opcode.1 = phi i32 [ %initval, %entry ], [ 1, %indirectgoto ]
+  tail call void @use(i32 %opcode.1)
+  br label %indirectgoto
+
+indirectgoto:
+  %p.addr.sink = phi i32* [ %p.addr.1, %bb1 ], [ %p.addr.0, %bb0 ]
+  %ptr = getelementptr inbounds i32, i32* %p.addr.sink, i64 1
+  %newp = load i32, i32* %p.addr.sink, align 4
+  %idx = sext i32 %newp to i64
+  %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @simple.targets, i64 0, i64 %idx
+  %newop = load i8*, i8** %arrayidx, align 8
+  indirectbr i8* %newop, [label %bb0, label %bb1]
+
+exit:
+  ret void
+}
+
+; Don't try to break critical edges when several indirectbrs point to a single block
+define void @multi(i32* nocapture readonly %p) {
+; CHECK-LABEL: @multi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[INITVAL:%.*]] = load i32,
i32* [[P]], align 4 +; CHECK-NEXT: [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: switch i32 [[INITOP]], label [[EXIT:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB0:%.*]] +; CHECK-NEXT: i32 1, label [[BB1:%.*]] +; CHECK-NEXT: ] +; CHECK: bb0: +; CHECK-NEXT: [[P_ADDR_0:%.*]] = phi i32* [ [[INCDEC_PTR]], [[ENTRY:%.*]] ], [ [[NEXT0:%.*]], [[BB0]] ], [ [[NEXT1:%.*]], [[BB1]] ] +; CHECK-NEXT: [[OPCODE_0:%.*]] = phi i32 [ [[INITVAL]], [[ENTRY]] ], [ 0, [[BB0]] ], [ 1, [[BB1]] ] +; CHECK-NEXT: tail call void @use(i32 [[OPCODE_0]]) +; CHECK-NEXT: [[NEXT0]] = getelementptr inbounds i32, i32* [[P_ADDR_0]], i64 1 +; CHECK-NEXT: [[NEWP0:%.*]] = load i32, i32* [[P_ADDR_0]], align 4 +; CHECK-NEXT: [[IDX0:%.*]] = sext i32 [[NEWP0]] to i64 +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 [[IDX0]] +; CHECK-NEXT: [[NEWOP0:%.*]] = load i8*, i8** [[ARRAYIDX0]], align 8 +; CHECK-NEXT: indirectbr i8* [[NEWOP0]], [label [[BB0]], label %bb1] +; CHECK: bb1: +; CHECK-NEXT: [[P_ADDR_1:%.*]] = phi i32* [ [[INCDEC_PTR]], [[ENTRY]] ], [ [[NEXT0]], [[BB0]] ], [ [[NEXT1]], [[BB1]] ] +; CHECK-NEXT: [[OPCODE_1:%.*]] = phi i32 [ [[INITVAL]], [[ENTRY]] ], [ 0, [[BB0]] ], [ 1, [[BB1]] ] +; CHECK-NEXT: tail call void @use(i32 [[OPCODE_1]]) +; CHECK-NEXT: [[NEXT1]] = getelementptr inbounds i32, i32* [[P_ADDR_1]], i64 1 +; CHECK-NEXT: [[NEWP1:%.*]] = load i32, i32* [[P_ADDR_1]], align 4 +; CHECK-NEXT: [[IDX1:%.*]] = sext i32 [[NEWP1]] to i64 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 [[IDX1]] +; CHECK-NEXT: [[NEWOP1:%.*]] = load i8*, i8** [[ARRAYIDX1]], align 8 +; CHECK-NEXT: indirectbr i8* [[NEWOP1]], [label [[BB0]], label %bb1] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1 + %initval = load i32, i32* %p, align 4 + %initop = load i32, i32* %incdec.ptr, align 4 + switch i32 %initop, label %exit [ + i32 0, label %bb0 + i32 1, label %bb1 + ] + +bb0: + %p.addr.0 = phi i32* [ %incdec.ptr, %entry ], [ %next0, %bb0 ], [ %next1, %bb1 ] + %opcode.0 = phi i32 [ %initval, %entry ], [ 0, %bb0 ], [ 1, %bb1 ] + tail call void @use(i32 %opcode.0) + %next0 = getelementptr inbounds i32, i32* %p.addr.0, i64 1 + %newp0 = load i32, i32* %p.addr.0, align 4 + %idx0 = sext i32 %newp0 to i64 + %arrayidx0 = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 %idx0 + %newop0 = load i8*, i8** %arrayidx0, align 8 + indirectbr i8* %newop0, [label %bb0, label %bb1] + +bb1: + %p.addr.1 = phi i32* [ %incdec.ptr, %entry ], [ %next0, %bb0 ], [ %next1, %bb1 ] + %opcode.1 = phi i32 [ %initval, %entry ], [ 0, %bb0 ], [ 1, %bb1 ] + tail call void @use(i32 %opcode.1) + %next1 = getelementptr inbounds i32, i32* %p.addr.1, i64 1 + %newp1 = load i32, i32* %p.addr.1, align 4 + %idx1 = sext i32 %newp1 to i64 + %arrayidx1 = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 %idx1 + %newop1 = load i8*, i8** %arrayidx1, align 8 + indirectbr i8* %newop1, [label %bb0, label %bb1] + +exit: + ret void +} + +; Make sure we do the right thing for cases where the indirectbr branches to +; the block it terminates. 
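+; (In @loop below, the indirectbr at the end of %bb0 lists %bb0 itself as a target.)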
+define void @loop(i64* nocapture readonly %p) {
+; CHECK-LABEL: @loop(
+; CHECK-NEXT: bb0.clone:
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: bb0:
+; CHECK-NEXT: br label [[DOTSPLIT]]
+; CHECK: .split:
+; CHECK-NEXT: [[MERGE:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[BB0:%.*]] ], [ 0, [[BB0_CLONE:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[MERGE]]
+; CHECK-NEXT: store i64 [[MERGE]], i64* [[TMP0]], align 4
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[MERGE]], 1
+; CHECK-NEXT: [[IDX:%.*]] = srem i64 [[MERGE]], 2
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @loop.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT: [[TARGET:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT: indirectbr i8* [[TARGET]], [label [[BB0]], label %bb1]
+; CHECK: bb1:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %bb0
+
+bb0:
+  %i = phi i64 [ %i.next, %bb0 ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i64, i64* %p, i64 %i
+  store i64 %i, i64* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %idx = srem i64 %i, 2
+  %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @loop.targets, i64 0, i64 %idx
+  %target = load i8*, i8** %arrayidx, align 8
+  indirectbr i8* %target, [label %bb0, label %bb1]
+
+bb1:
+  ret void
+}
+
+; Don't do anything for cases that contain no phis.
+define void @nophi(i32* %p) {
+; CHECK-LABEL: @nophi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT: i32 0, label [[BB0:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: bb0:
+; CHECK-NEXT: tail call void @use(i32 0)
+; CHECK-NEXT: br label [[INDIRECTGOTO:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: tail call void @use(i32 1)
+; CHECK-NEXT: br label [[INDIRECTGOTO]]
+; CHECK: indirectgoto:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to i8*
+; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR]] to i32*
+; CHECK-NEXT: [[NEWP:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT: [[IDX:%.*]] = sext i32 [[NEWP]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @nophi.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT: [[NEWOP:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT: indirectbr i8* [[NEWOP]], [label [[BB0]], label %bb1]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %initop = load i32, i32* %incdec.ptr, align 4
+  switch i32 %initop, label %exit [
+    i32 0, label %bb0
+    i32 1, label %bb1
+  ]
+
+bb0:
+  tail call void @use(i32 0)
+  br label %indirectgoto
+
+bb1:
+  tail call void @use(i32 1)
+  br label %indirectgoto
+
+indirectgoto:
+  %newp = load i32, i32* %incdec.ptr, align 4
+  %idx = sext i32 %newp to i64
+  %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @nophi.targets, i64 0, i64 %idx
+  %newop = load i8*, i8** %arrayidx, align 8
+  indirectbr i8* %newop, [label %bb0, label %bb1]
+
+exit:
+  ret void
+}
+
+; Don't do anything if the edge isn't critical.
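+; (In @noncritical below, each indirectbr successor has a single predecessor, so no edge is critical and no block should be cloned.)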
+define i32 @noncritical(i32 %k, i8* %p) +; CHECK-LABEL: @noncritical( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[D:%.*]] = add i32 [[K:%.*]], 1 +; CHECK-NEXT: indirectbr i8* [[P:%.*]], [label [[BB0:%.*]], label %bb1] +; CHECK: bb0: +; CHECK-NEXT: [[R0:%.*]] = sub i32 [[K]], [[D]] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[R1:%.*]] = sub i32 [[D]], [[K]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[V:%.*]] = phi i32 [ [[R0]], [[BB0]] ], [ [[R1]], [[BB1:%.*]] ] +; CHECK-NEXT: ret i32 0 +; +{ +entry: + %d = add i32 %k, 1 + indirectbr i8* %p, [label %bb0, label %bb1] + +bb0: + %v00 = phi i32 [%k, %entry] + %v01 = phi i32 [%d, %entry] + %r0 = sub i32 %v00, %v01 + br label %exit + +bb1: + %v10 = phi i32 [%d, %entry] + %v11 = phi i32 [%k, %entry] + %r1 = sub i32 %v10, %v11 + br label %exit + +exit: + %v = phi i32 [%r0, %bb0], [%r1, %bb1] + ret i32 0 +} diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll index 5c0b5f3839d0..9d6e668167fb 100644 --- a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll +++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu" ; Can we sink single addressing mode computation to use? define void @test1(i1 %cond, i64* %base) { ; CHECK-LABEL: @test1 -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 entry: %addr = getelementptr inbounds i64, i64* %base, i64 5 %casted = bitcast i64* %addr to i32* @@ -33,7 +33,7 @@ entry: if.then: ; CHECK-LABEL: if.then: -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 %v1 = load i32, i32* %casted, align 4 call void @foo(i32 %v1) %cmp = icmp eq i32 %v1, 0 @@ -41,7 +41,7 @@ if.then: next: ; CHECK-LABEL: next: -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 %v2 = load i32, i32* %casted, align 4 call void @foo(i32 %v2) br label %fallthrough @@ -61,10 +61,10 @@ entry: if.then: ; CHECK-LABEL: if.then: -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 %v1 = load i32, i32* %casted, align 4 call void @foo(i32 %v1) -; CHECK-NOT: add i64 {{.+}}, 40 +; CHECK-NOT: getelementptr i8, {{.+}}, 40 %v2 = load i32, i32* %casted, align 4 call void @foo(i32 %v2) br label %fallthrough @@ -84,7 +84,7 @@ entry: if.then: ; CHECK-LABEL: if.then: -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 %v1 = load i32, i32* %casted, align 4 call void @foo(i32 %v1) %cmp = icmp eq i32 %v1, 0 @@ -95,7 +95,7 @@ fallthrough: rare.1: ; CHECK-LABEL: rare.1: -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 call void @slowpath(i32 %v1, i32* %casted) cold br label %fallthrough } @@ -111,7 +111,7 @@ entry: if.then: ; CHECK-LABEL: if.then: -; CHECK-NOT: add i64 {{.+}}, 40 +; CHECK-NOT: getelementptr i8, {{.+}} 40 %v1 = load i32, i32* %casted, align 4 call void @foo(i32 %v1) %cmp = icmp eq i32 %v1, 0 @@ -136,7 +136,7 @@ entry: if.then: ; CHECK-LABEL: if.then: -; CHECK-NOT: add i64 {{.+}}, 40 +; CHECK-NOT: getelementptr i8, {{.+}} 40 %v1 = load i32, i32* %casted, align 4 call void @foo(i32 %v1) %cmp = icmp eq i32 %v1, 0 @@ -162,7 +162,7 @@ entry: if.then: ; CHECK-LABEL: if.then: -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 %v1 = load i32, i32* %casted, align 4 call void @foo(i32 %v1) %cmp = icmp eq i32 %v1, 0 @@ -170,7 +170,7 @@ if.then: next: ; CHECK-LABEL: next: -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 %v2 = load i32, i32* %casted, align 4 call void @foo(i32 %v2) 
%cmp2 = icmp eq i32 %v2, 0 @@ -181,13 +181,13 @@ fallthrough: rare.1: ; CHECK-LABEL: rare.1: -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 call void @slowpath(i32 %v1, i32* %casted) cold br label %next rare.2: ; CHECK-LABEL: rare.2: -; CHECK: add i64 {{.+}}, 40 +; CHECK: getelementptr i8, {{.+}} 40 call void @slowpath(i32 %v2, i32* %casted) cold br label %fallthrough } diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll index c9f49b5d4f86..31f0ca239e3a 100644 --- a/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll +++ b/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll @@ -1,11 +1,12 @@ -; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -codegenprepare < %s | FileCheck %s -check-prefix=CHECK -check-prefix=GEP target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-unknown-linux-gnu" ; CHECK-LABEL: @load_cast_gep -; CHECK: add i64 %sunkaddr, 40 +; GEP: [[CAST:%[0-9]+]] = addrspacecast i64* %base to i8 addrspace(1)* +; GEP: getelementptr i8, i8 addrspace(1)* [[CAST]], i64 40 define void @load_cast_gep(i1 %cond, i64* %base) { entry: %addr = getelementptr inbounds i64, i64* %base, i64 5 @@ -21,7 +22,8 @@ fallthrough: } ; CHECK-LABEL: @store_gep_cast -; CHECK: add i64 %sunkaddr, 20 +; GEP: [[CAST:%[0-9]+]] = addrspacecast i64* %base to i8 addrspace(1)* +; GEP: getelementptr i8, i8 addrspace(1)* [[CAST]], i64 20 define void @store_gep_cast(i1 %cond, i64* %base) { entry: %casted = addrspacecast i64* %base to i32 addrspace(1)* diff --git a/test/Transforms/CodeGenPrepare/basic.ll b/test/Transforms/CodeGenPrepare/basic.ll index 495d910b5cd6..2e58de7d0934 100644 --- a/test/Transforms/CodeGenPrepare/basic.ll +++ b/test/Transforms/CodeGenPrepare/basic.ll @@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10.0.0" ; rdar://8785296 define i32 @test1(i8* %ptr) nounwind ssp noredzone align 2 { entry: - %0 = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) + %0 = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false, i1 false) %1 = icmp ugt i64 %0, 3 br i1 %1, label %T, label %trap @@ -25,6 +25,44 @@ T: ret i32 4 } -declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readonly +; CHECK-LABEL: @test_objectsize_null_flag( +define i64 @test_objectsize_null_flag(i8* %ptr) { +entry: + ; CHECK: ret i64 -1 + %0 = tail call i64 @llvm.objectsize.i64(i8* null, i1 false, i1 true) + ret i64 %0 +} + +; CHECK-LABEL: @test_objectsize_null_flag_min( +define i64 @test_objectsize_null_flag_min(i8* %ptr) { +entry: + ; CHECK: ret i64 0 + %0 = tail call i64 @llvm.objectsize.i64(i8* null, i1 true, i1 true) + ret i64 %0 +} + +; Test foldable null pointers because we evaluate them with non-exact modes in +; CodeGenPrepare. 
+; CHECK-LABEL: @test_objectsize_null_flag_noas0( +define i64 @test_objectsize_null_flag_noas0() { +entry: + ; CHECK: ret i64 0 + %0 = tail call i64 @llvm.objectsize.i64.p1i8(i8 addrspace(1)* null, i1 false, + i1 true) + ret i64 %0 +} + +; CHECK-LABEL: @test_objectsize_null_flag_min_noas0( +define i64 @test_objectsize_null_flag_min_noas0() { +entry: + ; CHECK: ret i64 0 + %0 = tail call i64 @llvm.objectsize.i64.p1i8(i8 addrspace(1)* null, i1 true, + i1 true) + ret i64 %0 +} + + +declare i64 @llvm.objectsize.i64(i8*, i1, i1) nounwind readonly +declare i64 @llvm.objectsize.i64.p1i8(i8 addrspace(1)*, i1, i1) nounwind readonly declare void @llvm.trap() nounwind diff --git a/test/Transforms/CodeGenPrepare/builtin-condition.ll b/test/Transforms/CodeGenPrepare/builtin-condition.ll index 0d41e9e1eddb..e42529a7b9a1 100644 --- a/test/Transforms/CodeGenPrepare/builtin-condition.ll +++ b/test/Transforms/CodeGenPrepare/builtin-condition.ll @@ -74,39 +74,39 @@ entry: %chararray = alloca [30 x i8], align 16 %chararray2 = alloca [10 x i8], align 1 %0 = getelementptr inbounds [30 x i8], [30 x i8]* %chararray, i64 0, i64 0 - call void @llvm.lifetime.start(i64 30, i8* %0) + call void @llvm.lifetime.start.p0i8(i64 30, i8* %0) %1 = getelementptr inbounds [10 x i8], [10 x i8]* %chararray2, i64 0, i64 0 - call void @llvm.lifetime.start(i64 10, i8* %1) + call void @llvm.lifetime.start.p0i8(i64 10, i8* %1) %tobool = icmp eq i32 %flag, 0 %cptr.0 = select i1 %tobool, i8* %0, i8* %1 %2 = call i64 @llvm.objectsize.i64.p0i8(i8* %cptr.0, i1 true) - call void @llvm.lifetime.end(i64 10, i8* %1) - call void @llvm.lifetime.end(i64 30, i8* %0) + call void @llvm.lifetime.end.p0i8(i64 10, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 30, i8* %0) ret i64 %2 ; CHECK-LABEL: foo1 ; CHECK: ret i64 10 } -declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) define i64 @foo2(i32 %n) { entry: %Small = alloca [10 x i8], align 1 %Large = alloca [20 x i8], align 16 %0 = getelementptr inbounds [10 x i8], [10 x i8]* %Small, i64 0, i64 0 - call void @llvm.lifetime.start(i64 10, i8* %0) + call void @llvm.lifetime.start.p0i8(i64 10, i8* %0) %1 = getelementptr inbounds [20 x i8], [20 x i8]* %Large, i64 0, i64 0 - call void @llvm.lifetime.start(i64 20, i8* %1) + call void @llvm.lifetime.start.p0i8(i64 20, i8* %1) %tobool = icmp ne i32 %n, 0 %add.ptr = getelementptr inbounds [20 x i8], [20 x i8]* %Large, i64 0, i64 19 %cond = select i1 %tobool, i8* %0, i8* %add.ptr %2 = call i64 @llvm.objectsize.i64.p0i8(i8* %cond, i1 false) - call void @llvm.lifetime.end(i64 20, i8* %1) - call void @llvm.lifetime.end(i64 10, i8* %0) + call void @llvm.lifetime.end.p0i8(i64 20, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 10, i8* %0) ret i64 %2 ; CHECK-LABEL: foo2 ; CHECK: ret i64 10 diff --git a/test/Transforms/CodeGenPrepare/section.ll b/test/Transforms/CodeGenPrepare/section.ll index 795c45c220db..2c96612e1baf 100644 --- a/test/Transforms/CodeGenPrepare/section.ll +++ b/test/Transforms/CodeGenPrepare/section.ll @@ -5,12 +5,32 @@ target triple = "x86_64-pc-linux-gnu" ; This tests that hot/cold functions get correct section prefix assigned ; CHECK: hot_func{{.*}}!section_prefix ![[HOT_ID:[0-9]+]] +; The entry is hot define void @hot_func() !prof !15 { ret void } +; CHECK: hot_call_func{{.*}}!section_prefix ![[HOT_ID]] +; The sum of 2 callsites 
is hot
+define void @hot_call_func() !prof !16 {
+  call void @hot_func(), !prof !17
+  call void @hot_func(), !prof !17
+  ret void
+}
+
+; CHECK-NOT: normal_func{{.*}}!section_prefix
+; The sum of all callsites is neither hot nor cold
+define void @normal_func() !prof !16 {
+  call void @hot_func(), !prof !17
+  call void @hot_func(), !prof !18
+  call void @hot_func(), !prof !18
+  ret void
+}
+
 ; CHECK: cold_func{{.*}}!section_prefix ![[COLD_ID:[0-9]+]]
+; The entry and the callsite are both cold
 define void @cold_func() !prof !16 {
+  call void @hot_func(), !prof !18
   ret void
 }
 
@@ -33,3 +53,5 @@ define void @cold_func() !prof !16 {
 !14 = !{i32 999999, i64 1, i32 2}
 !15 = !{!"function_entry_count", i64 1000}
 !16 = !{!"function_entry_count", i64 1}
+!17 = !{!"branch_weights", i32 80}
+!18 = !{!"branch_weights", i32 1}
diff --git a/test/Transforms/ConstProp/loads.ll b/test/Transforms/ConstProp/loads.ll
index 89387ad06ba8..dce2068a8d55 100644
--- a/test/Transforms/ConstProp/loads.ll
+++ b/test/Transforms/ConstProp/loads.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -default-data-layout="e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE
-; RUN: opt < %s -default-data-layout="E-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE
+; RUN: opt < %s -data-layout="e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -data-layout="E-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE
 
 ; {{ 0xDEADBEEF, 0xBA }, 0xCAFEBABE}
 @g1 = constant {{i32,i8},i32} {{i32,i8} { i32 -559038737, i8 186 }, i32 -889275714 }
diff --git a/test/Transforms/ConstantHoisting/X86/ehpad.ll b/test/Transforms/ConstantHoisting/X86/ehpad.ll
new file mode 100644
index 000000000000..3178e87f7548
--- /dev/null
+++ b/test/Transforms/ConstantHoisting/X86/ehpad.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -consthoist < %s | FileCheck %s
+
+; FIXME: The catchpad doesn't even use the constant, so a better fix would be to
+; insert the bitcast in the catchpad block.
+ +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc" + +; CHECK-LABEL: define i32 @main +; CHECK: %tobool = icmp eq i32 %argc, 0 +; CHECK-NEXT: bitcast i64 9209618997431186100 to i64 +; CHECK-NEXT: br i1 %tobool + +; Function Attrs: norecurse +define i32 @main(i32 %argc, i8** nocapture readnone %argv) local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { + %call = tail call i64 @fn(i64 0) + %call1 = tail call i64 @fn(i64 1) + %tobool = icmp eq i32 %argc, 0 + br i1 %tobool, label %2, label %1 + +; <label>:1: ; preds = %0 + %call2 = invoke i64 @fn(i64 %call) + to label %6 unwind label %catch.dispatch + +; <label>:2: ; preds = %0 + %call3 = invoke i64 @fn(i64 %call1) + to label %6 unwind label %catch.dispatch + +catch.dispatch: ; preds = %2, %1 + %z.0 = phi i64 [ %call, %1 ], [ %call1, %2 ] + %3 = catchswitch within none [label %4] unwind to caller + +; <label>:4: ; preds = %catch.dispatch + %5 = catchpad within %3 [i8* null, i32 64, i8* null] + br i1 %tobool, label %then, label %else + +then: + %call4 = tail call i64 @fn(i64 %z.0) [ "funclet"(token %5) ] + %add = add i64 %call4, 9209618997431186100 + br label %endif + +else: + %call5 = tail call i64 @fn(i64 0) [ "funclet"(token %5) ] + %add6 = add i64 %call5, 9209618997431186100 + br label %endif + +endif: + %v = phi i64 [ %add, %then ], [ %add6, %else ] + %call7 = tail call i64 @fn(i64 %v) [ "funclet"(token %5) ] + %call8 = tail call i64 @fn(i64 %call7) [ "funclet"(token %5) ] + catchret from %5 to label %6 + +; <label>:6: ; preds = %1, %2, %4 + ret i32 0 +} + +declare i64 @fn(i64) local_unnamed_addr #1 + +declare i32 @__CxxFrameHandler3(...) + +attributes #0 = { norecurse "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/Transforms/ConstantMerge/dont-merge.ll b/test/Transforms/ConstantMerge/dont-merge.ll index e5337dff27df..21e390785df5 100644 --- a/test/Transforms/ConstantMerge/dont-merge.ll +++ b/test/Transforms/ConstantMerge/dont-merge.ll @@ -42,3 +42,41 @@ define void @test3() { call void asm sideeffect "T3A, T3B",""() ; invisible use of T3A and T3B ret void } + +; Don't merge constants with !type annotations. 
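+; (!type annotations are consumed by passes such as CFI, so folding two globals with different type identities could change those checks.)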
+ +@T4A1 = internal constant i32 2, !type !0 +@T4A2 = internal unnamed_addr constant i32 2, !type !1 + +@T4B1 = internal constant i32 3, !type !0 +@T4B2 = internal unnamed_addr constant i32 3, !type !0 + +@T4C1 = internal constant i32 4, !type !0 +@T4C2 = unnamed_addr constant i32 4 + +@T4D1 = unnamed_addr constant i32 5, !type !0 +@T4D2 = internal constant i32 5 + +!0 = !{i64 0, !"typeinfo name for A"} +!1 = !{i64 0, !"typeinfo name for B"} + +; CHECK: @T4A1 +; CHECK: @T4A2 +; CHECK: @T4B1 +; CHECK: @T4B2 +; CHECK: @T4C1 +; CHECK: @T4C2 +; CHECK: @T4D1 +; CHECK: @T4D2 + +define void @test4(i32** %P1, i32** %P2, i32** %P3, i32** %P4, i32** %P5, i32** %P6, i32** %P7, i32** %P8) { + store i32* @T4A1, i32** %P1 + store i32* @T4A2, i32** %P2 + store i32* @T4B1, i32** %P3 + store i32* @T4B2, i32** %P4 + store i32* @T4C1, i32** %P5 + store i32* @T4C2, i32** %P6 + store i32* @T4D1, i32** %P7 + store i32* @T4D2, i32** %P8 + ret void +} diff --git a/test/Transforms/ConstantMerge/merge-dbg.ll b/test/Transforms/ConstantMerge/merge-dbg.ll new file mode 100644 index 000000000000..bc33248514e0 --- /dev/null +++ b/test/Transforms/ConstantMerge/merge-dbg.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -constmerge -S | FileCheck %s + +; CHECK: = constant i32 1, !dbg [[A:![0-9]+]], !dbg [[B:![0-9]+]] +@a = internal constant i32 1, !dbg !0 +@b = unnamed_addr constant i32 1, !dbg !9 + +define void @test1(i32** %P1, i32** %P2) { + store i32* @a, i32** %P1 + store i32* @b, i32** %P2 + ret void +} + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!7, !8} + +; CHECK: [[A]] = !DIGlobalVariableExpression(var: [[VA:![0-9]+]]) +; CHECK: [[VA]] = distinct !DIGlobalVariable(name: "y" +; CHECK: [[B]] = !DIGlobalVariableExpression(var: [[VB:![0-9]+]]) +; CHECK: [[VB]] = distinct !DIGlobalVariable(name: "x" + +!0 = !DIGlobalVariableExpression(var: !1) +!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 5.0.0 (trunk 297227) (llvm/trunk 297234)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5) +!3 = !DIFile(filename: "1.cc", directory: "/build") +!4 = !{} +!5 = !{!0} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 2, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} + +!9 = !DIGlobalVariableExpression(var: !10) +!10 = distinct !DIGlobalVariable(name: "y", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true) diff --git a/test/Transforms/Coroutines/ArgAddr.ll b/test/Transforms/Coroutines/ArgAddr.ll index 4bedb510cd9e..5d0fbd781be9 100644 --- a/test/Transforms/Coroutines/ArgAddr.ll +++ b/test/Transforms/Coroutines/ArgAddr.ll @@ -32,7 +32,7 @@ coro_Cleanup: br label %coro_Suspend coro_Suspend: - call void @llvm.coro.end(i8* null, i1 false) + call i1 @llvm.coro.end(i8* null, i1 false) ret i8* %1 } @@ -61,7 +61,7 @@ declare i32 @llvm.coro.size.i32() declare i8* @llvm.coro.begin(token, i8*) declare i8 @llvm.coro.suspend(token, i1) declare i8* @llvm.coro.free(token, i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) diff --git a/test/Transforms/Coroutines/coro-frame.ll b/test/Transforms/Coroutines/coro-frame.ll new file mode 100644 index 000000000000..001012fcd0c9 --- /dev/null +++ b/test/Transforms/Coroutines/coro-frame.ll @@ -0,0 +1,61 @@ +; Check that we can handle 
spills of the result of the invoke instruction
+; RUN: opt < %s -coro-split -S | FileCheck %s
+
+define i8* @f() "coroutine.presplit"="1" personality i32 0 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+  %r = invoke double @print(double 0.0) to label %cont unwind label %pad
+
+cont:
+  %0 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %0, label %suspend [i8 0, label %resume
+                                i8 1, label %cleanup]
+resume:
+  call double @print(double %r)
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  ret i8* %hdl
+pad:
+  %tok = cleanuppad within none []
+  cleanupret from %tok unwind to caller
+}
+
+; See if the float was added to the frame
+; CHECK-LABEL: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i1, i1, double }
+
+; See if the float was spilled into the frame
+; CHECK-LABEL: @f(
+; CHECK: %r = call double @print(
+; CHECK: %r.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4
+; CHECK: store double %r, double* %r.spill.addr
+; CHECK: ret i8* %hdl
+
+; See if the float was loaded from the frame
+; CHECK-LABEL: @f.resume(
+; CHECK: %r.reload = load double, double* %r.reload.addr
+; CHECK: call double @print(double %r.reload)
+; CHECK: ret void
+
+declare i8* @llvm.coro.free(token, i8*)
+declare i32 @llvm.coro.size.i32()
+declare i8 @llvm.coro.suspend(token, i1)
+declare void @llvm.coro.resume(i8*)
+declare void @llvm.coro.destroy(i8*)
+
+declare token @llvm.coro.id(i32, i8*, i8*, i8*)
+declare i1 @llvm.coro.alloc(token)
+declare i8* @llvm.coro.begin(token, i8*)
+declare i1 @llvm.coro.end(i8*, i1)
+
+declare noalias i8* @malloc(i32)
+declare double @print(double)
+declare void @free(i8*)
diff --git a/test/Transforms/Coroutines/coro-spill-after-phi.ll b/test/Transforms/Coroutines/coro-spill-after-phi.ll
new file mode 100644
index 000000000000..3c7e050c09e9
--- /dev/null
+++ b/test/Transforms/Coroutines/coro-spill-after-phi.ll
@@ -0,0 +1,60 @@
+; Verifies that we insert spills of PHI instructions after all PHI nodes
+; RUN: opt < %s -coro-split -S | FileCheck %s
+
+define i8* @f(i1 %n) "coroutine.presplit"="1" {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+  br i1 %n, label %begin, label %alt
+alt:
+  br label %begin
+
+begin:
+  %phi1 = phi i32 [ 0, %entry ], [ 2, %alt ]
+  %phi2 = phi i32 [ 1, %entry ], [ 3, %alt ]
+
+  %sp1 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %sp1, label %suspend [i8 0, label %resume
+                                  i8 1, label %cleanup]
+resume:
+  call i32 @print(i32 %phi1)
+  call i32 @print(i32 %phi2)
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  ret i8* %hdl
+}
+
+; Verifies that both phis are stored correctly in the coroutine frame
+; CHECK: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i1, i1, i32, i32 }
+; CHECK-LABEL: @f(
+; CHECK: store void (%f.Frame*)* @f.destroy, void (%f.Frame*)** %destroy.addr
+; CHECK: %phi1 = select i1 %n, i32 0, i32 2
+; CHECK: %phi2 = select i1 %n, i32 1, i32 3
+; CHECK:
%phi2.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 5 +; CHECK: store i32 %phi2, i32* %phi2.spill.addr +; CHECK: %phi1.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4 +; CHECK: store i32 %phi1, i32* %phi1.spill.addr +; CHECK: ret i8* %hdl + +declare i8* @llvm.coro.free(token, i8*) +declare i32 @llvm.coro.size.i32() +declare i8 @llvm.coro.suspend(token, i1) +declare void @llvm.coro.resume(i8*) +declare void @llvm.coro.destroy(i8*) + +declare token @llvm.coro.id(i32, i8*, i8*, i8*) +declare i1 @llvm.coro.alloc(token) +declare i8* @llvm.coro.begin(token, i8*) +declare i1 @llvm.coro.end(i8*, i1) + +declare noalias i8* @malloc(i32) +declare i32 @print(i32) +declare void @free(i8*) diff --git a/test/Transforms/Coroutines/coro-split-00.ll b/test/Transforms/Coroutines/coro-split-00.ll index 12aec27b2fe6..0461b7dddb6c 100644 --- a/test/Transforms/Coroutines/coro-split-00.ll +++ b/test/Transforms/Coroutines/coro-split-00.ll @@ -28,7 +28,7 @@ cleanup: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 0) + call i1 @llvm.coro.end(i8* %hdl, i1 0) ret i8* %hdl } @@ -72,7 +72,7 @@ declare void @llvm.coro.destroy(i8*) declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i1 @llvm.coro.alloc(token) declare i8* @llvm.coro.begin(token, i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) declare noalias i8* @malloc(i32) declare void @print(i32) diff --git a/test/Transforms/Coroutines/coro-split-01.ll b/test/Transforms/Coroutines/coro-split-01.ll index 2b5801f7ddd1..cff2e9ca6f0a 100644 --- a/test/Transforms/Coroutines/coro-split-01.ll +++ b/test/Transforms/Coroutines/coro-split-01.ll @@ -26,7 +26,7 @@ cleanup: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 0) + call i1 @llvm.coro.end(i8* %hdl, i1 0) ret i8* %hdl } define i32 @main() { @@ -49,7 +49,7 @@ declare void @llvm.coro.destroy(i8*) declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i1 @llvm.coro.alloc(token) declare i8* @llvm.coro.begin(token, i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) declare noalias i8* @malloc(i32) declare void @print(i32) diff --git a/test/Transforms/Coroutines/coro-split-02.ll b/test/Transforms/Coroutines/coro-split-02.ll index 2326f77f1987..953c25088652 100644 --- a/test/Transforms/Coroutines/coro-split-02.ll +++ b/test/Transforms/Coroutines/coro-split-02.ll @@ -28,7 +28,7 @@ await.ready: call void @print(i32 %val) br label %exit exit: - call void @llvm.coro.end(i8* null, i1 false) + call i1 @llvm.coro.end(i8* null, i1 false) ret void } @@ -50,5 +50,5 @@ declare i8* @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare void @"\01??3@YAXPEAX@Z"(i8*) local_unnamed_addr #10 declare i8* @llvm.coro.free(token, i8* nocapture readonly) #2 -declare void @llvm.coro.end(i8*, i1) #3 +declare i1 @llvm.coro.end(i8*, i1) #3 diff --git a/test/Transforms/Coroutines/coro-split-dbg.ll b/test/Transforms/Coroutines/coro-split-dbg.ll index 293622c40ebd..80f706879e55 100644 --- a/test/Transforms/Coroutines/coro-split-dbg.ll +++ b/test/Transforms/Coroutines/coro-split-dbg.ll @@ -38,12 +38,12 @@ coro_Cleanup: ; preds = %for.cond br label %coro_Suspend, !dbg !36 coro_Suspend: ; preds = %for.cond, %if.then, %coro_Cleanup - tail call void @llvm.coro.end(i8* null, i1 false) #9, !dbg !38 + tail call i1 @llvm.coro.end(i8* null, i1 false) #9, !dbg !38 ret i8* %2, !dbg !39 } ; Function Attrs: argmemonly nounwind -declare 
void @llvm.lifetime.start(i64, i8* nocapture) #4
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #4
 
 ; Function Attrs: argmemonly nounwind readonly
 declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #5
@@ -54,10 +54,10 @@ declare i64 @llvm.coro.size.i64() #1
 declare i8* @llvm.coro.begin(token, i8* writeonly) #7
 declare token @llvm.coro.save(i8*) #7
 declare i8 @llvm.coro.suspend(token, i1) #7
-declare void @llvm.lifetime.end(i64, i8* nocapture) #4
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #4
 declare i8* @llvm.coro.free(token, i8* nocapture readonly) #5
 declare void @free(i8* nocapture) local_unnamed_addr #6
-declare void @llvm.coro.end(i8*, i1) #7
+declare i1 @llvm.coro.end(i8*, i1) #7
 declare i8* @llvm.coro.subfn.addr(i8* nocapture readonly, i8) #5
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
diff --git a/test/Transforms/Coroutines/coro-split-eh.ll b/test/Transforms/Coroutines/coro-split-eh.ll
new file mode 100644
index 000000000000..7fc97e261e81
--- /dev/null
+++ b/test/Transforms/Coroutines/coro-split-eh.ll
@@ -0,0 +1,145 @@
+; Tests that coro-split removes cleanup code after coro.end in resume functions
+; and retains it in the start function.
+; RUN: opt < %s -coro-split -S | FileCheck %s
+
+define i8* @f(i1 %val) "coroutine.presplit"="1" personality i32 3 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* null)
+  call void @print(i32 0)
+  br i1 %val, label %resume, label %susp
+
+susp:
+  %0 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %0, label %suspend [i8 0, label %resume
+                                i8 1, label %suspend]
+resume:
+  invoke void @print(i32 1) to label %suspend unwind label %lpad
+
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  call void @print(i32 0) ; should not be present in f.resume
+  ret i8* %hdl
+
+lpad:
+  %lpval = landingpad { i8*, i32 }
+     cleanup
+
+  call void @print(i32 2)
+  %need.resume = call i1 @llvm.coro.end(i8* null, i1 true)
+  br i1 %need.resume, label %eh.resume, label %cleanup.cont
+
+cleanup.cont:
+  call void @print(i32 3) ; should not be present in f.resume
+  br label %eh.resume
+
+eh.resume:
+  resume { i8*, i32 } %lpval
+}
+
+; Verify that the start function contains both print calls, the one before and the one after coro.end
+; CHECK-LABEL: define i8* @f(
+; CHECK: invoke void @print(i32 1)
+; CHECK:   to label %AfterCoroEnd unwind label %lpad
+
+; CHECK: AfterCoroEnd:
+; CHECK:   call void @print(i32 0)
+; CHECK:   ret i8* %hdl
+
+; CHECK: lpad:
+; CHECK-NEXT: %lpval = landingpad { i8*, i32 }
+; CHECK-NEXT:   cleanup
+; CHECK-NEXT: call void @print(i32 2)
+; CHECK-NEXT: call void @print(i32 3)
+; CHECK-NEXT: resume { i8*, i32 } %lpval
+
+define i8* @f2(i1 %val) "coroutine.presplit"="1" personality i32 4 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* null)
+  call void @print(i32 0)
+  br i1 %val, label %resume, label %susp
+
+susp:
+  %0 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %0, label %suspend [i8 0, label %resume
+                                i8 1, label %suspend]
+resume:
+  invoke void @print(i32 1) to label %suspend unwind label %lpad
+
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  call void @print(i32 0) ; should not be present in f.resume
+  ret i8* %hdl
+
+lpad:
+  %tok = cleanuppad within none []
+  call void @print(i32 2)
+  %unused = call i1 @llvm.coro.end(i8* null, i1 true) [ "funclet"(token %tok) ]
+  cleanupret from %tok unwind
label %cleanup.cont
+
+cleanup.cont:
+  %tok2 = cleanuppad within none []
+  call void @print(i32 3) ; should not be present in f.resume
+  cleanupret from %tok2 unwind to caller
+}
+
+; Verify that the start function contains both print calls, the one before and the one after coro.end
+; CHECK-LABEL: define i8* @f2(
+; CHECK: invoke void @print(i32 1)
+; CHECK:   to label %AfterCoroEnd unwind label %lpad
+
+; CHECK: AfterCoroEnd:
+; CHECK:   call void @print(i32 0)
+; CHECK:   ret i8* %hdl
+
+; CHECK: lpad:
+; CHECK-NEXT: %tok = cleanuppad within none []
+; CHECK-NEXT: call void @print(i32 2)
+; CHECK-NEXT: call void @print(i32 3)
+; CHECK-NEXT: cleanupret from %tok unwind to caller
+
+; VERIFY Resume Parts
+
+; Verify that the resume function does not contain the print calls appearing after coro.end
+; CHECK-LABEL: define internal fastcc void @f.resume
+; CHECK: invoke void @print(i32 1)
+; CHECK:   to label %CoroEnd unwind label %lpad
+
+; CHECK: CoroEnd:
+; CHECK-NEXT: ret void
+
+; CHECK: lpad:
+; CHECK-NEXT: %lpval = landingpad { i8*, i32 }
+; CHECK-NEXT:   cleanup
+; CHECK-NEXT: call void @print(i32 2)
+; CHECK-NEXT: resume { i8*, i32 } %lpval
+
+; Verify that the resume function does not contain the print calls appearing after coro.end
+; CHECK-LABEL: define internal fastcc void @f2.resume
+; CHECK: invoke void @print(i32 1)
+; CHECK:   to label %CoroEnd unwind label %lpad
+
+; CHECK: CoroEnd:
+; CHECK-NEXT: ret void
+
+; CHECK: lpad:
+; CHECK-NEXT: %tok = cleanuppad within none []
+; CHECK-NEXT: call void @print(i32 2)
+; CHECK-NEXT: cleanupret from %tok unwind to caller
+
+declare i8* @llvm.coro.free(token, i8*)
+declare i32 @llvm.coro.size.i32()
+declare i8 @llvm.coro.suspend(token, i1)
+declare void @llvm.coro.resume(i8*)
+declare void @llvm.coro.destroy(i8*)
+
+declare token @llvm.coro.id(i32, i8*, i8*, i8*)
+declare i8* @llvm.coro.alloc(token)
+declare i8* @llvm.coro.begin(token, i8*)
+declare i1 @llvm.coro.end(i8*, i1)
+
+declare noalias i8* @malloc(i32)
+declare void @print(i32)
+declare void @free(i8*)
+
diff --git a/test/Transforms/Coroutines/ex0.ll b/test/Transforms/Coroutines/ex0.ll
index d4a9f941d838..59bebc546649 100644
--- a/test/Transforms/Coroutines/ex0.ll
+++ b/test/Transforms/Coroutines/ex0.ll
@@ -24,7 +24,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 0)
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
   ret i8* %hdl
 }
 
@@ -52,7 +52,7 @@ declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
 
 declare i8* @llvm.coro.begin(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 declare noalias i8* @malloc(i32)
 declare void @print(i32)
diff --git a/test/Transforms/Coroutines/ex1.ll b/test/Transforms/Coroutines/ex1.ll
index 86ac75b13404..c2a5586fde58 100644
--- a/test/Transforms/Coroutines/ex1.ll
+++ b/test/Transforms/Coroutines/ex1.ll
@@ -20,7 +20,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret i8* %hdl
 }
 
@@ -48,7 +48,7 @@ declare i32 @llvm.coro.size.i32()
 declare i8* @llvm.coro.begin(token, i8*)
 declare i8 @llvm.coro.suspend(token, i1)
 declare i8* @llvm.coro.free(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
diff --git a/test/Transforms/Coroutines/ex2.ll b/test/Transforms/Coroutines/ex2.ll
index 8681e4cecc80..6987d2a4c9fd 100644
--- a/test/Transforms/Coroutines/ex2.ll
+++
b/test/Transforms/Coroutines/ex2.ll @@ -29,7 +29,7 @@ dyn.free: call void @CustomFree(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 false) + call i1 @llvm.coro.end(i8* %hdl, i1 false) ret i8* %hdl } @@ -57,7 +57,7 @@ declare i32 @llvm.coro.size.i32() declare i8* @llvm.coro.begin(token, i8*) declare i8 @llvm.coro.suspend(token, i1) declare i8* @llvm.coro.free(token, i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) diff --git a/test/Transforms/Coroutines/ex3.ll b/test/Transforms/Coroutines/ex3.ll index 13289c8e974a..8ff4d718230f 100644 --- a/test/Transforms/Coroutines/ex3.ll +++ b/test/Transforms/Coroutines/ex3.ll @@ -26,7 +26,7 @@ cleanup: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 false) + call i1 @llvm.coro.end(i8* %hdl, i1 false) ret i8* %hdl } @@ -54,7 +54,7 @@ declare i32 @llvm.coro.size.i32() declare i8* @llvm.coro.begin(token, i8*) declare i8 @llvm.coro.suspend(token, i1) declare i8* @llvm.coro.free(token, i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) diff --git a/test/Transforms/Coroutines/ex4.ll b/test/Transforms/Coroutines/ex4.ll index ce896ad7ee41..4992052acd2e 100644 --- a/test/Transforms/Coroutines/ex4.ll +++ b/test/Transforms/Coroutines/ex4.ll @@ -28,7 +28,7 @@ cleanup: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 false) + call i1 @llvm.coro.end(i8* %hdl, i1 false) ret i8* %hdl } @@ -65,7 +65,7 @@ declare i32 @llvm.coro.size.i32() declare i8* @llvm.coro.begin(token, i8*) declare i8 @llvm.coro.suspend(token, i1) declare i8* @llvm.coro.free(token, i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) diff --git a/test/Transforms/Coroutines/ex5.ll b/test/Transforms/Coroutines/ex5.ll index c9772825f250..34767584c811 100644 --- a/test/Transforms/Coroutines/ex5.ll +++ b/test/Transforms/Coroutines/ex5.ll @@ -31,7 +31,7 @@ cleanup: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 false) + call i1 @llvm.coro.end(i8* %hdl, i1 false) ret i8* %hdl } @@ -46,7 +46,7 @@ declare i8* @llvm.coro.begin(token, i8*) declare token @llvm.coro.save(i8*) declare i8 @llvm.coro.suspend(token, i1) declare i8* @llvm.coro.free(token, i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) ; CHECK-LABEL: @main define i32 @main() { diff --git a/test/Transforms/Coroutines/no-suspend.ll b/test/Transforms/Coroutines/no-suspend.ll index d219495de6cc..804b38cc1abe 100644 --- a/test/Transforms/Coroutines/no-suspend.ll +++ b/test/Transforms/Coroutines/no-suspend.ll @@ -32,7 +32,7 @@ dyn.free: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 false) + call i1 @llvm.coro.end(i8* %hdl, i1 false) ret void } @@ -77,7 +77,7 @@ cleanup: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 false) + call i1 @llvm.coro.end(i8* %hdl, i1 false) ret void } @@ -122,7 +122,7 @@ cleanup: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 false) + call i1 @llvm.coro.end(i8* %hdl, i1 false) ret void } @@ -167,7 +167,7 @@ cleanup: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* 
%hdl, i1 false) + call i1 @llvm.coro.end(i8* %hdl, i1 false) ret void } @@ -183,7 +183,7 @@ declare i8* @llvm.coro.begin(token, i8*) declare token @llvm.coro.save(i8* %hdl) declare i8 @llvm.coro.suspend(token, i1) declare i8* @llvm.coro.free(token, i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) diff --git a/test/Transforms/Coroutines/phi-coro-end.ll b/test/Transforms/Coroutines/phi-coro-end.ll index e2529412e72c..f99990cf33bc 100644 --- a/test/Transforms/Coroutines/phi-coro-end.ll +++ b/test/Transforms/Coroutines/phi-coro-end.ll @@ -17,7 +17,7 @@ cleanup: suspend: %r = phi i32 [%n, %entry], [1, %cleanup] - call void @llvm.coro.end(i8* %hdl, i1 false) + call i1 @llvm.coro.end(i8* %hdl, i1 false) call void @print(i32 %r) ret i8* %hdl } @@ -41,7 +41,7 @@ declare void @llvm.coro.destroy(i8*) declare token @llvm.coro.id(i32, i8*, i8*, i8*) declare i8* @llvm.coro.begin(token, i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) declare noalias i8* @malloc(i32) declare void @print(i32) diff --git a/test/Transforms/Coroutines/restart-trigger.ll b/test/Transforms/Coroutines/restart-trigger.ll index 2240f8fa6323..f7f203f2fb5c 100644 --- a/test/Transforms/Coroutines/restart-trigger.ll +++ b/test/Transforms/Coroutines/restart-trigger.ll @@ -25,7 +25,7 @@ cleanup: call void @free(i8* %mem) br label %suspend suspend: - call void @llvm.coro.end(i8* %hdl, i1 0) + call i1 @llvm.coro.end(i8* %hdl, i1 0) ret void } @@ -36,7 +36,7 @@ declare i32 @llvm.coro.size.i32() declare i8 @llvm.coro.suspend(token, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) -declare void @llvm.coro.end(i8*, i1) +declare i1 @llvm.coro.end(i8*, i1) declare noalias i8* @malloc(i32) declare void @print(i32) diff --git a/test/Transforms/CorrelatedValuePropagation/alloca.ll b/test/Transforms/CorrelatedValuePropagation/alloca.ll index 0a6ba675a477..37b27b29445c 100644 --- a/test/Transforms/CorrelatedValuePropagation/alloca.ll +++ b/test/Transforms/CorrelatedValuePropagation/alloca.ll @@ -13,14 +13,14 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [8 x i8] c"a = %l\0A\00", align 1 ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) declare void @hoo(i64*) declare i32 @printf(i8* nocapture readonly, ...) ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) define void @goo(i32 %N, i64* %b) { entry: @@ -35,12 +35,12 @@ for.cond: ; preds = %for.body, %entry br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond - call void @llvm.lifetime.start(i64 8, i8* %tmp) + call void @llvm.lifetime.start.p0i8(i64 8, i8* %tmp) call void @hoo(i64* %a.i) call void @hoo(i64* %c) %tmp1 = load volatile i64, i64* %a.i, align 8 %call.i = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i64 %tmp1)
-  call void @llvm.lifetime.end(i64 8, i8* %tmp)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* %tmp)
   %inc = add nsw i32 %i.0, 1
   br label %for.cond
 
diff --git a/test/Transforms/CorrelatedValuePropagation/basic.ll b/test/Transforms/CorrelatedValuePropagation/basic.ll
index 9836c7f80778..14b9a1999cc3 100644
--- a/test/Transforms/CorrelatedValuePropagation/basic.ll
+++ b/test/Transforms/CorrelatedValuePropagation/basic.ll
@@ -115,9 +115,9 @@ negative:
   i32 1, label %out
 ; CHECK-NOT: i32 1
   i32 -1, label %next
-; CHECK: i32 -1, label %next
+; CHECK-DAG: i32 -1, label %next
   i32 -2, label %next
-; CHECK: i32 -2, label %next
+; CHECK-DAG: i32 -2, label %next
   i32 2, label %out
 ; CHECK-NOT: i32 2
   i32 3, label %out
diff --git a/test/Transforms/DeadArgElim/call_profile.ll b/test/Transforms/DeadArgElim/call_profile.ll
new file mode 100644
index 000000000000..6acb6f000dbe
--- /dev/null
+++ b/test/Transforms/DeadArgElim/call_profile.ll
@@ -0,0 +1,22 @@
+; RUN: opt -deadargelim -S < %s | FileCheck %s
+
+; Checks that !prof metadata is correct in deadargelim.
+
+define void @caller() #0 {
+; CHECK: call void @test_vararg(), !prof ![[PROF:[0-9]]]
+; CHECK: call void @test(), !prof ![[PROF]]
+  call void (i32, ...) @test_vararg(i32 1), !prof !0
+  call void @test(i32 1), !prof !0
+  ret void
+}
+
+define internal void @test_vararg(i32, ...) #1 {
+  ret void
+}
+
+define internal void @test(i32 %a) #1 {
+  ret void
+}
+
+; CHECK:![[PROF]] = !{!"branch_weights", i32 30}
+!0 = !{!"branch_weights", i32 30}
diff --git a/test/Transforms/DeadStoreElimination/dominate.ll b/test/Transforms/DeadStoreElimination/dominate.ll
index 638992bae729..24dd65e07bbc 100644
--- a/test/Transforms/DeadStoreElimination/dominate.ll
+++ b/test/Transforms/DeadStoreElimination/dominate.ll
@@ -9,12 +9,12 @@ bb1:
   br label %bb3
 
 bb2:
-  call void @llvm.lifetime.end(i64 -1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
   br label %bb3
 
 bb3:
   call void @bar()
-  call void @llvm.lifetime.end(i64 -1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
  br label %bb4
 
 bb4:
@@ -22,4 +22,4 @@ bb4:
 }
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
diff --git a/test/Transforms/DeadStoreElimination/lifetime.ll b/test/Transforms/DeadStoreElimination/lifetime.ll
index 305c916dc02b..97f199b5e0f6 100644
--- a/test/Transforms/DeadStoreElimination/lifetime.ll
+++ b/test/Transforms/DeadStoreElimination/lifetime.ll
@@ -2,8 +2,8 @@
 
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 declare void @llvm.memset.p0i8.i8(i8* nocapture, i8, i8, i32, i1) nounwind
 
 define void @test1() {
@@ -11,7 +11,7 @@
   %A = alloca i8
 
   store i8 0, i8* %A ;; Written to by memset
-  call void @llvm.lifetime.end(i64 1, i8* %A)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %A)
 ; CHECK: lifetime.end
 
   call void @llvm.memset.p0i8.i8(i8* %A, i8 0, i8 -1, i32 0, i1 false)
@@ -25,11 +25,11 @@ define void @test2(i32* %P) {
 ; CHECK: test2
   %Q = getelementptr i32, i32* %P, i32 1
   %R = bitcast i32* %Q to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %R)
+  call void
@llvm.lifetime.start.p0i8(i64 4, i8* %R) ; CHECK: lifetime.start store i32 0, i32* %Q ;; This store is dead. ; CHECK-NOT: store - call void @llvm.lifetime.end(i64 4, i8* %R) + call void @llvm.lifetime.end.p0i8(i64 4, i8* %R) ; CHECK: lifetime.end ret void } diff --git a/test/Transforms/DeadStoreElimination/operand-bundles.ll b/test/Transforms/DeadStoreElimination/operand-bundles.ll index d71b9673ed1d..784b2e8e55f7 100644 --- a/test/Transforms/DeadStoreElimination/operand-bundles.ll +++ b/test/Transforms/DeadStoreElimination/operand-bundles.ll @@ -41,3 +41,15 @@ define void @test3() { store i64 0, i64* %s ret void } + +declare noalias i8* @calloc(i64, i64) + +define void @test4() { +; CHECK-LABEL: @test4 + %local_obj = call i8* @calloc(i64 1, i64 4) + call void @foo() ["deopt" (i8* %local_obj)] + store i8 0, i8* %local_obj, align 4 + ; CHECK-NOT: store i8 0, i8* %local_obj, align 4 + call void @bar(i8* nocapture %local_obj) + ret void +} diff --git a/test/Transforms/EarlyCSE/readnone-mayunwind.ll b/test/Transforms/EarlyCSE/readnone-mayunwind.ll new file mode 100644 index 000000000000..47a513f2d6a6 --- /dev/null +++ b/test/Transforms/EarlyCSE/readnone-mayunwind.ll @@ -0,0 +1,15 @@ +; RUN: opt -S -early-cse < %s | FileCheck %s + +declare void @readnone_may_unwind() readnone + +define void @f(i32* %ptr) { +; CHECK-LABEL: @f( +; CHECK: store i32 100, i32* %ptr +; CHECK: call void @readnone_may_unwind() +; CHECK: store i32 200, i32* %ptr + + store i32 100, i32* %ptr + call void @readnone_may_unwind() + store i32 200, i32* %ptr + ret void +} diff --git a/test/Transforms/FunctionAttrs/nonnull.ll b/test/Transforms/FunctionAttrs/nonnull.ll index 1fb64b7434ab..4a1ff14b2041 100644 --- a/test/Transforms/FunctionAttrs/nonnull.ll +++ b/test/Transforms/FunctionAttrs/nonnull.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -functionattrs %s | FileCheck %s +; RUN: opt -S -functionattrs -enable-nonnull-arg-prop %s | FileCheck %s declare nonnull i8* @ret_nonnull() ; Return a pointer trivially nonnull (call return attribute) @@ -71,4 +71,148 @@ exit: ret i8* %phi } +; Test propagation of nonnull callsite args back to caller. + +declare void @use1(i8* %x) +declare void @use2(i8* %x, i8* %y); +declare void @use3(i8* %x, i8* %y, i8* %z); + +declare void @use1nonnull(i8* nonnull %x); +declare void @use2nonnull(i8* nonnull %x, i8* nonnull %y); +declare void @use3nonnull(i8* nonnull %x, i8* nonnull %y, i8* nonnull %z); + +declare i8 @use1safecall(i8* %x) readonly nounwind ; readonly+nounwind guarantees that execution continues to successor + +; Can't extend non-null to parent for any argument because the 2nd call is not guaranteed to execute. + +define void @parent1(i8* %a, i8* %b, i8* %c) { +; CHECK-LABEL: @parent1(i8* %a, i8* %b, i8* %c) +; CHECK-NEXT: call void @use3(i8* %c, i8* %a, i8* %b) +; CHECK-NEXT: call void @use3nonnull(i8* %b, i8* %c, i8* %a) +; CHECK-NEXT: ret void +; + call void @use3(i8* %c, i8* %a, i8* %b) + call void @use3nonnull(i8* %b, i8* %c, i8* %a) + ret void +} + +; Extend non-null to parent for all arguments. + +define void @parent2(i8* %a, i8* %b, i8* %c) { +; CHECK-LABEL: @parent2(i8* nonnull %a, i8* nonnull %b, i8* nonnull %c) +; CHECK-NEXT: call void @use3nonnull(i8* %b, i8* %c, i8* %a) +; CHECK-NEXT: call void @use3(i8* %c, i8* %a, i8* %b) +; CHECK-NEXT: ret void +; + call void @use3nonnull(i8* %b, i8* %c, i8* %a) + call void @use3(i8* %c, i8* %a, i8* %b) + ret void +} + +; Extend non-null to parent for 1st argument. 
+
+define void @parent3(i8* %a, i8* %b, i8* %c) {
+; CHECK-LABEL: @parent3(i8* nonnull %a, i8* %b, i8* %c)
+; CHECK-NEXT: call void @use1nonnull(i8* %a)
+; CHECK-NEXT: call void @use3(i8* %c, i8* %b, i8* %a)
+; CHECK-NEXT: ret void
+;
+  call void @use1nonnull(i8* %a)
+  call void @use3(i8* %c, i8* %b, i8* %a)
+  ret void
+}
+
+; Extend non-null to parent for last 2 arguments.
+
+define void @parent4(i8* %a, i8* %b, i8* %c) {
+; CHECK-LABEL: @parent4(i8* %a, i8* nonnull %b, i8* nonnull %c)
+; CHECK-NEXT: call void @use2nonnull(i8* %c, i8* %b)
+; CHECK-NEXT: call void @use2(i8* %a, i8* %c)
+; CHECK-NEXT: call void @use1(i8* %b)
+; CHECK-NEXT: ret void
+;
+  call void @use2nonnull(i8* %c, i8* %b)
+  call void @use2(i8* %a, i8* %c)
+  call void @use1(i8* %b)
+  ret void
+}
+
+; The callsite must execute in order for the attribute to transfer to the parent.
+; It appears benign to extend non-null to the parent in this case, but we can't do that
+; because it would propagate incorrect information to its callers.
+
+define void @parent5(i8* %a, i1 %a_is_notnull) {
+; CHECK-LABEL: @parent5(i8* %a, i1 %a_is_notnull)
+; CHECK-NEXT: br i1 %a_is_notnull, label %t, label %f
+; CHECK: t:
+; CHECK-NEXT: call void @use1nonnull(i8* %a)
+; CHECK-NEXT: ret void
+; CHECK: f:
+; CHECK-NEXT: ret void
+;
+  br i1 %a_is_notnull, label %t, label %f
+t:
+  call void @use1nonnull(i8* %a)
+  ret void
+f:
+  ret void
+}
+
+; The callsite must execute in order for the attribute to transfer to the parent.
+; The volatile load might trap, so there's no guarantee that we'll ever get to the call.
+
+define i8 @parent6(i8* %a, i8* %b) {
+; CHECK-LABEL: @parent6(i8* %a, i8* %b)
+; CHECK-NEXT: [[C:%.*]] = load volatile i8, i8* %b
+; CHECK-NEXT: call void @use1nonnull(i8* %a)
+; CHECK-NEXT: ret i8 [[C]]
+;
+  %c = load volatile i8, i8* %b
+  call void @use1nonnull(i8* %a)
+  ret i8 %c
+}
+
+; The nonnull callsite is guaranteed to execute, so the argument must be nonnull throughout the parent.
+
+define i8 @parent7(i8* %a) {
+; CHECK-LABEL: @parent7(i8* nonnull %a)
+; CHECK-NEXT: [[RET:%.*]] = call i8 @use1safecall(i8* %a)
+; CHECK-NEXT: call void @use1nonnull(i8* %a)
+; CHECK-NEXT: ret i8 [[RET]]
+;
+  %ret = call i8 @use1safecall(i8* %a)
+  call void @use1nonnull(i8* %a)
+  ret i8 %ret
+}
+
+; Make sure that an invoke works similarly to a call.
+
+declare i32 @esfp(...)
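+; (@esfp is referenced only as the personality function for the invoke test in @parent8 below.)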
+
+define i1 @parent8(i8* %a, i8* %bogus1, i8* %b) personality i8* bitcast (i32 (...)* @esfp to i8*){
+; CHECK-LABEL: @parent8(i8* nonnull %a, i8* nocapture readnone %bogus1, i8* nonnull %b)
+; CHECK-NEXT: entry:
+; CHECK-NEXT: invoke void @use2nonnull(i8* %a, i8* %b)
+; CHECK-NEXT: to label %cont unwind label %exc
+; CHECK: cont:
+; CHECK-NEXT: [[NULL_CHECK:%.*]] = icmp eq i8* %b, null
+; CHECK-NEXT: ret i1 [[NULL_CHECK]]
+; CHECK: exc:
+; CHECK-NEXT: [[LP:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT: filter [0 x i8*] zeroinitializer
+; CHECK-NEXT: unreachable
+;
+entry:
+  invoke void @use2nonnull(i8* %a, i8* %b)
+    to label %cont unwind label %exc
+
+cont:
+  %null_check = icmp eq i8* %b, null
+  ret i1 %null_check
+
+exc:
+  %lp = landingpad { i8*, i32 }
+    filter [0 x i8*] zeroinitializer
+  unreachable
+}
diff --git a/test/Transforms/FunctionImport/funcimport.ll b/test/Transforms/FunctionImport/funcimport.ll
index 97c18488af64..cc732a3bd98d 100644
--- a/test/Transforms/FunctionImport/funcimport.ll
+++ b/test/Transforms/FunctionImport/funcimport.ll
@@ -4,20 +4,16 @@
; RUN: llvm-lto -thinlto -print-summary-global-ids -o %t3 %t.bc %t2.bc 2>&1 | FileCheck %s --check-prefix=GUID

; Do the import now
-; RUN: opt -disable-force-link-odr -function-import -stats -print-imports -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -S 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIMDEF
+; RUN: opt -function-import -stats -print-imports -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -S 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIMDEF
; Try again with new pass manager
-; RUN: opt -disable-force-link-odr -passes='function-import' -stats -print-imports -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -S 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIMDEF
+; RUN: opt -passes='function-import' -stats -print-imports -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -S 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIMDEF
; "-stats" requires +Asserts.
; REQUIRES: asserts

; Test import with smaller instruction limit
-; RUN: opt -disable-force-link-odr -function-import -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=5 -S | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIM5
+; RUN: opt -function-import -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=5 -S | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIM5
; INSTLIM5-NOT: @staticfunc.llvm.

-; Test import with smaller instruction limit and without the -disable-force-link-odr
-; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=5 -S | FileCheck %s --check-prefix=INSTLIM5ODR
-; INSTLIM5ODR: define linkonce_odr void @linkonceodr() {
-
define i32 @main() #0 {
entry:
@@ -44,10 +40,12 @@ declare void @weakalias(...) #1
; CHECK-DAG: declare void @analias
declare void @analias(...) #1

-; FIXME: Add this checking back when follow on fix to add alias summary
-; records is committed.
; Aliases import the aliasee function
declare void @linkoncealias(...) #1
+; INSTLIMDEF-DAG: Import linkoncealias
+; INSTLIMDEF-DAG: Import linkoncefunc
+; CHECK-DAG: define linkonce_odr void @linkoncefunc()
+; CHECK-DAG: @linkoncealias = alias void (...), bitcast (void ()* @linkoncefunc to void (...)*

; INSTLIMDEF-DAG: Import referencestatics
; INSTLIMDEF-DAG: define available_externally i32 @referencestatics(i32 %i) !thinlto_src_module !0 {
diff --git a/test/Transforms/FunctionImport/unnamed-globals.ll b/test/Transforms/FunctionImport/unnamed-globals.ll
new file mode 100644
index 000000000000..167fad28f439
--- /dev/null
+++ b/test/Transforms/FunctionImport/unnamed-globals.ll
@@ -0,0 +1,10 @@
+; Make sure we don't crash when referencing an unnamed global.
+; RUN: opt %s -module-summary-analysis -S
+
+@0 = external global [1 x { i64 }]
+
+define internal void @tinkywinky() {
+  call void @patatino(i64 ptrtoint ([1 x { i64 }]* @0 to i64), i64 4)
+  ret void
+}
+declare void @patatino(i64, i64)
diff --git a/test/Transforms/GVN/PRE/rle-addrspace-cast.ll b/test/Transforms/GVN/PRE/rle-addrspace-cast.ll
index 07fd7c11d1b5..d8de5b360ba1 100644
--- a/test/Transforms/GVN/PRE/rle-addrspace-cast.ll
+++ b/test/Transforms/GVN/PRE/rle-addrspace-cast.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -default-data-layout="e-p:32:32:32-p1:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -data-layout="e-p:32:32:32-p1:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s

define i8 @coerce_offset0_addrspacecast(i32 %V, i32* %P) {
  store i32 %V, i32* %P
diff --git a/test/Transforms/GVN/PRE/rle.ll b/test/Transforms/GVN/PRE/rle.ll
index c1946faab20e..1d2cba2f1f64 100644
--- a/test/Transforms/GVN/PRE/rle.ll
+++ b/test/Transforms/GVN/PRE/rle.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -default-data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
-; RUN: opt < %s -default-data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -basicaa -gvn -S -die | FileCheck %s

;; Trivial RLE test.
define i32 @test0(i32 %V, i32* %P) {
diff --git a/test/Transforms/GVN/cond_br2.ll b/test/Transforms/GVN/cond_br2.ll
index baa282ec200c..a3749510cb4a 100644
--- a/test/Transforms/GVN/cond_br2.ll
+++ b/test/Transforms/GVN/cond_br2.ll
@@ -18,7 +18,7 @@ define void @_Z4testv() #0 personality i8* bitcast (i32 (...)* @__gxx_personalit
entry:
  %sv = alloca %"class.llvm::SmallVector", align 16
  %0 = bitcast %"class.llvm::SmallVector"* %sv to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 64, i8* %0) #1
  %BeginX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0
  %FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
  %1 = bitcast %"union.llvm::SmallVectorBase::U"* %FirstEl.i.i.i.i.i.i to i8*
@@ -94,7 +94,7 @@ if.then.i.i.i20: ; preds = %invoke.cont3
  br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21

_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20
-  call void @llvm.lifetime.end(i64 64, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %0) #1
  ret void

lpad: ; preds = %if.end.i14, %if.end.i, %invoke.cont2
@@ -113,14 +113,14 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa
}

; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1

declare i32 @__gxx_personality_v0(...)

declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(%"class.llvm::SmallVector"*) #2

; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1

declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(%"class.llvm::SmallVectorBase"*, i64, i64) #2
diff --git a/test/Transforms/GVN/debugloc.ll b/test/Transforms/GVN/debugloc.ll
new file mode 100644
index 000000000000..d8c1632d1442
--- /dev/null
+++ b/test/Transforms/GVN/debugloc.ll
@@ -0,0 +1,77 @@
+; RUN: opt < %s -gvn -S | FileCheck %s
+; CHECK: {{^}}for.body:
+; CHECK-NEXT: [[VREG1:%[^ ]+]] = phi{{.*}}[[VREG2:%[^ ]+]],{{.*}}%.sink,
+; CHECK-NOT: !dbg
+; CHECK-SAME: {{$}}
+; CHECK: {{^}}for.inc:
+; CHECK-NEXT: [[VREG2]] = phi{{.*}}%inc,{{.*}}[[VREG1]]
+
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: nounwind uwtable
+define void @foo(i32 %x, i32 %y, i32 %z) local_unnamed_addr #0 !dbg !4 {
+entry:
+  %not.tobool = icmp eq i32 %x, 0, !dbg !8
+  %.sink = zext i1 %not.tobool to i32, !dbg !8
+  store i32 %.sink, i32* @g, align 4, !tbaa !9
+  %cmp8 = icmp sgt i32 %y, 0, !dbg !13
+  br i1 %cmp8, label %for.body.preheader, label %for.end, !dbg !17
+
+for.body.preheader: ; preds = %entry
+  br label %for.body, !dbg !19
+
+for.body: ; preds = %for.body.preheader, %for.inc
+  %i.09 = phi i32 [ %inc4, %for.inc ], [ 0, %for.body.preheader ]
+  %cmp1 = icmp sgt i32 %i.09, %z, !dbg !19
+  br i1 %cmp1, label %if.then2, label %for.inc, !dbg !21
+
+if.then2: ; preds = %for.body
+  %0 = load i32, i32* @g, align 4, !dbg !22, !tbaa !9
+  %inc = add nsw i32 %0, 1, !dbg !22
+  store i32 %inc, i32* @g, align 4, !dbg !22, !tbaa !9
+  br label %for.inc, !dbg !23
+
+for.inc: ; preds = %for.body, %if.then2
+  %inc4 = add nuw nsw i32 %i.09, 1, !dbg !24
+  %exitcond = icmp ne i32 %inc4, %y, !dbg !13
+  br i1 %exitcond, label %for.body, label %for.end.loopexit, !dbg !17
+
+for.end.loopexit: ; preds = %for.inc
+  br label %for.end, !dbg !26
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void, !dbg !26
+}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1)
+!1 = !DIFile(filename: "foo.c", directory: "b/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7, !7, !7}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!8 = !DILocation(line: 4, column: 7, scope: !4)
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+!13 = !DILocation(line: 10, column: 13, scope: !14)
+!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 1)
+!15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 10, column: 3)
+!16 = distinct !DILexicalBlock(scope: !4, file: !1, line: 10, column: 3)
+!17 = !DILocation(line: 10, column: 3, scope: !18)
+!18 = !DILexicalBlockFile(scope: !16, file: !1, discriminator: 1)
+!19 = !DILocation(line: 11, column: 11, scope: !20)
+!20 = distinct !DILexicalBlock(scope: !15, file: !1, line: 11, column: 9)
+!21 = !DILocation(line: 11, column: 9, scope: !15)
+!22 = !DILocation(line: 12, column: 8, scope: !20)
+!23 = !DILocation(line: 12, column: 7, scope: !20)
+!24 = !DILocation(line: 10, column: 20, scope: !25)
+!25 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2)
+!26 = !DILocation(line: 13, column: 1, scope: !4)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
diff --git a/test/Transforms/GVN/fence.ll b/test/Transforms/GVN/fence.ll
index f68861dad1ac..a2d98e13b380 100644
--- a/test/Transforms/GVN/fence.ll
+++ b/test/Transforms/GVN/fence.ll
@@ -1,5 +1,6 @@
; RUN: opt -S -basicaa -gvn < %s | FileCheck %s

+@a = external constant i32
; We can value forward across the fence since we can (semantically)
; reorder the following load before the fence.
define i32 @test(i32* %addr.i) {
@@ -52,6 +53,25 @@ define i32 @test3(i32* noalias %addr.i, i32* noalias %otheraddr) {
  ret i32 %res
}

+; We can forward the load across both
+; the fences, because the load is from
+; a constant memory location.
+define i32 @test4(i32* %addr) {
+; CHECK-LABEL: @test4
+; CHECK-NOT: load
+; CHECK: fence release
+; CHECK: store
+; CHECK: fence seq_cst
+; CHECK: ret i32 0
+  %var = load i32, i32* @a
+  fence release
+  store i32 42, i32* %addr, align 8
+  fence seq_cst
+  %var2 = load i32, i32* @a
+  %var3 = sub i32 %var, %var2
+  ret i32 %var3
+}
+
; Another example of why forwarding across an acquire fence is problematic
; can be seen in a normal locking operation.  Say we had:
; *p = 5; unlock(l); lock(l); use(p);
diff --git a/test/Transforms/GVN/invariant.group.ll b/test/Transforms/GVN/invariant.group.ll
index 6f1f357cad65..570519bec520 100644
--- a/test/Transforms/GVN/invariant.group.ll
+++ b/test/Transforms/GVN/invariant.group.ll
@@ -382,12 +382,12 @@ define void @testNotGlobal() {
  %b0 = bitcast i8* %a to i1*
  call void @fooBit(i1* %b0, i1 1)

-; CHECK: %trunc = trunc i8 %b to i1
+; CHECK: %1 = trunc i8 %b to i1
  %2 = load i1, i1* %b0, !invariant.group !0
-; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %trunc)
+; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %1)
  call void @fooBit(i1* %b0, i1 %2)

  %3 = load i1, i1* %b0, !invariant.group !0
-; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %trunc)
+; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %1)
  call void @fooBit(i1* %b0, i1 %3)
  ret void
}
diff --git a/test/Transforms/GVN/lifetime-simple.ll b/test/Transforms/GVN/lifetime-simple.ll
index d03b62c8158a..8da3e4cbd30f 100644
--- a/test/Transforms/GVN/lifetime-simple.ll
+++ b/test/Transforms/GVN/lifetime-simple.ll
@@ -8,13 +8,13 @@ define i8 @test(i8* %P) nounwind {
; CHECK-NOT: load
; CHECK: lifetime.end
entry:
-  call void @llvm.lifetime.start(i64 32, i8* %P)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %P)
  %0 = load i8, i8* %P
  store i8 1, i8* %P
-  call void @llvm.lifetime.end(i64 32, i8* %P)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %P)
  %1 = load i8, i8* %P
  ret i8 %1
}
-declare void @llvm.lifetime.start(i64 %S, i8* nocapture %P) readonly
-declare void @llvm.lifetime.end(i64 %S, i8* nocapture %P)
+declare void @llvm.lifetime.start.p0i8(i64 %S, i8* nocapture %P) readonly
+declare void @llvm.lifetime.end.p0i8(i64 %S, i8* nocapture %P)
diff --git a/test/Transforms/GVNHoist/hoist-inline.ll b/test/Transforms/GVNHoist/hoist-inline.ll
new file mode 100644
index 000000000000..7d761486ab15
--- /dev/null
+++ b/test/Transforms/GVNHoist/hoist-inline.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S -O2 < %s | FileCheck %s
+
+; Check that the inlined loads are hoisted.
+; CHECK-LABEL: define i32 @fun(
+; CHECK-LABEL: entry:
+; CHECK: load i32, i32* @A
+; CHECK: if.then:
+
+@A = external global i32
+@B = external global i32
+@C = external global i32
+
+define i32 @loadA() {
+  %a = load i32, i32* @A
+  ret i32 %a
+}
+
+define i32 @fun(i1 %c) {
+entry:
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  store i32 1, i32* @B
+  %call1 = call i32 @loadA()
+  store i32 2, i32* @C
+  br label %if.endif
+
+if.else:
+  store i32 2, i32* @C
+  %call2 = call i32 @loadA()
+  store i32 1, i32* @B
+  br label %if.endif
+
+if.endif:
+  %ret = phi i32 [ %call1, %if.then ], [ %call2, %if.else ]
+  ret i32 %ret
+}
+
diff --git a/test/Transforms/GVNHoist/hoist-pr31891.ll b/test/Transforms/GVNHoist/hoist-pr31891.ll
new file mode 100644
index 000000000000..3f6a22fc54a6
--- /dev/null
+++ b/test/Transforms/GVNHoist/hoist-pr31891.ll
@@ -0,0 +1,83 @@
+; RUN: opt -S -gvn-hoist < %s | FileCheck %s
+
+; Hoisted inlinable calls need to have accurate scope information, but we're
+; allowed to erase the line information.
+
+source_filename = "t.c"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+; Function Attrs: noinline nounwind readnone uwtable
+define float @fabsf(float %f) #0 !dbg !7 {
+entry:
+  %conv = fpext float %f to double, !dbg !9
+  %call = call double @fabs(double %conv) #1, !dbg !10
+  %conv1 = fptrunc double %call to float, !dbg !11
+  ret float %conv1, !dbg !12
+}
+
+; Function Attrs: nounwind readnone
+declare double @fabs(double) #1
+
+; Function Attrs: noinline nounwind uwtable
+define void @hoistit(i32 %cond, float %f) #2 !dbg !13 {
+entry:
+  %tobool = icmp ne i32 %cond, 0, !dbg !14
+  br i1 %tobool, label %if.then, label %if.else, !dbg !14
+
+if.then: ; preds = %entry
+  %call = call float @fabsf(float %f) #1, !dbg !15
+  call void @useit1(float %call), !dbg !16
+  br label %if.end, !dbg !18
+
+if.else: ; preds = %entry
+  %call1 = call float @fabsf(float %f) #1, !dbg !19
+  call void @useit2(float %call1), !dbg !20
+  br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+  ret void, !dbg !21
+}
+
+; CHECK-LABEL: define void @hoistit
+; CHECK-SAME: !dbg ![[sp_hoistit:[0-9]+]]
+; CHECK: call float @fabsf(float %f) {{.*}} !dbg ![[dbgloc:[0-9]+]]
+; CHECK: br i1 %tobool, label %if.then, label %if.else
+
+; CHECK: ![[sp_hoistit]] = distinct !DISubprogram(name: "hoistit", {{.*}})
+; CHECK: ![[dbgloc]] = !DILocation({{.*}}, scope: ![[sp_hoistit]])
+
+declare void @useit1(float)
+
+declare void @useit2(float)
+
+attributes #0 = { noinline nounwind readnone uwtable }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noinline nounwind uwtable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "t.c", directory: "C:\5Csrc\5Cllvm\5Cbuild")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 5.0.0 "}
+!7 = distinct !DISubprogram(name: "fabsf", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 5, column: 22, scope: !7)
+!10 = !DILocation(line: 5, column: 17, scope: !7)
+!11 = !DILocation(line: 5, column: 10, scope: !7)
+!12 = !DILocation(line: 5, column: 3, scope: !7)
+!13 = distinct !DISubprogram(name: "hoistit", scope: !1, file: !1, line: 7, type: !8, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!14 = !DILocation(line: 8, column: 7, scope: !13)
+!15 = !DILocation(line: 9, column: 12, scope: !13)
+!16 = !DILocation(line: 9, column: 5, scope: !17)
+!17 = !DILexicalBlockFile(scope: !13, file: !1, discriminator: 1)
+!18 = !DILocation(line: 10, column: 3, scope: !13)
+!19 = !DILocation(line: 11, column: 12, scope: !13)
+!20 = !DILocation(line: 11, column: 5, scope: !17)
+!21 = !DILocation(line: 13, column: 1, scope: !13)
diff --git a/test/Transforms/GVNHoist/hoist-very-busy.ll b/test/Transforms/GVNHoist/hoist-very-busy.ll
new file mode 100644
index 000000000000..f421eff9921a
--- /dev/null
+++ b/test/Transforms/GVNHoist/hoist-very-busy.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -gvn-hoist < %s | FileCheck %s
+
+%struct.__jmp_buf_tag = type { [8 x i64], i32 }
+
+; Check that hoisting only happens when the expression is very busy.
+; CHECK: store
+; CHECK: store
+
+@test_exit_buf = global %struct.__jmp_buf_tag zeroinitializer
+@G = global i32 0
+
+define void @test_command(i32 %c1) {
+entry:
+  switch i32 %c1, label %exit [
+    i32 0, label %sw0
+    i32 1, label %sw1
+  ]
+
+sw0:
+  store i32 1, i32* @G
+  br label %exit
+
+sw1:
+  store i32 1, i32* @G
+  br label %exit
+
+exit:
+  call void @longjmp(%struct.__jmp_buf_tag* @test_exit_buf, i32 1) #0
+  unreachable
+}
+
+declare void @longjmp(%struct.__jmp_buf_tag*, i32) #0
+
+attributes #0 = { noreturn nounwind }
+
+; Check that the store is hoisted.
+; CHECK-LABEL: define void @fun(
+; CHECK: store
+; CHECK-NOT: store
+
+define void @fun() {
+entry:
+  br label %if.then
+
+if.then: ; preds = %entry
+  br i1 undef, label %sw0, label %sw1
+
+sw0:
+  store i32 1, i32* @G
+  unreachable
+
+sw1:
+  store i32 1, i32* @G
+  ret void
+}
diff --git a/test/Transforms/InstMerge/ld_hoist1.ll b/test/Transforms/GVNHoist/ld_hoist1.ll
index 74c8900b8ab1..8d4698d87e6f 100644
--- a/test/Transforms/InstMerge/ld_hoist1.ll
+++ b/test/Transforms/GVNHoist/ld_hoist1.ll
@@ -1,5 +1,5 @@
; Test load hoist
-; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+; RUN: opt -gvn-hoist -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"
diff --git a/test/Transforms/InstMerge/ld_hoist_st_sink.ll b/test/Transforms/GVNHoist/ld_hoist_st_sink.ll
index 1d3f941882e5..c85edc2d8170 100644
--- a/test/Transforms/InstMerge/ld_hoist_st_sink.ll
+++ b/test/Transforms/GVNHoist/ld_hoist_st_sink.ll
@@ -1,6 +1,6 @@
; Tests to make sure that loads and stores in a diamond get merged
; Loads are hoisted into the header. Stores are sunk into the footer.
-; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+; RUN: opt -gvn-hoist -S < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

%struct.node = type { i64, %struct.node*, %struct.node*, %struct.node*, i64, %struct.arc*, i64, i64, i64 }
@@ -41,7 +41,7 @@ if.then: ; preds = %while.body
  %4 = load i64, i64* %p, align 8
  %add = add nsw i64 %4, %2
  %p1 = getelementptr inbounds %struct.node, %struct.node* %node.020, i64 0, i32 6
-; CHECK-NOT: store i64
+; FIXME: store i64
  store i64 %add, i64* %p1, align 8
  br label %if.end
@@ -61,13 +61,13 @@ if.else: ; preds = %while.body
  %8 = load i64, i64* %cost5, align 8
  %sub = sub nsw i64 %6, %8
  %p6 = getelementptr inbounds %struct.node, %struct.node* %node.020, i64 0, i32 6
-; CHECK-NOT: store i64
+; FIXME: store i64
  store i64 %sub, i64* %p6, align 8
  br label %if.end

; CHECK: if.end
if.end: ; preds = %if.else, %if.then
-; CHECK: store
+; FIXME: store
  %inc = add nsw i64 %sum.019, 1
  %node.0.in = getelementptr inbounds %struct.node, %struct.node* %node.020, i64 0, i32 2
  %node.0 = load %struct.node*, %struct.node** %node.0.in, align 8
diff --git a/test/Transforms/GVNHoist/pr29034.ll b/test/Transforms/GVNHoist/pr29034.ll
index 5e725ad38c86..c0fcc3e741a8 100644
--- a/test/Transforms/GVNHoist/pr29034.ll
+++ b/test/Transforms/GVNHoist/pr29034.ll
@@ -38,7 +38,7 @@ define void @music_task(i8* nocapture readnone %p) local_unnamed_addr {
entry:
  %mapi = alloca %struct._MUSIC_OP_API_*, align 8
  %0 = bitcast %struct._MUSIC_OP_API_** %mapi to i8*
-  call void @llvm.lifetime.start(i64 8, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %0)
  store %struct._MUSIC_OP_API_* null, %struct._MUSIC_OP_API_** %mapi, align 8, !tbaa !1
  %call = call i32 @music_decoder_init(%struct._MUSIC_OP_API_** nonnull %mapi)
  br label %while.cond
@@ -103,7 +103,7 @@ while.cond2.backedge: ; preds = %sw.default, %sw.bb1
  br label %while.cond2
}

-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
declare i32 @music_decoder_init(%struct._MUSIC_OP_API_**)
declare i32 @music_play_api(%struct._MUSIC_OP_API_*, i32, i32, i32, i8*)
declare i32 @printf(i8* nocapture readonly, ...)
diff --git a/test/Transforms/JumpThreading/crash-assertingvh.ll b/test/Transforms/GlobalDCE/crash-assertingvh.ll
index e78431992239..2919999d5e28 100644
--- a/test/Transforms/JumpThreading/crash-assertingvh.ll
+++ b/test/Transforms/GlobalDCE/crash-assertingvh.ll
@@ -1,4 +1,9 @@
+; Make sure that if a pass like jump threading populates a function analysis
+; like LVI with asserting handles into the body of a function, those don't begin
+; to assert when global DCE deletes the body of the function.
+;
; RUN: opt -disable-output < %s -passes='module(function(jump-threading),globaldce)'
+; RUN: opt -disable-output < %s -passes='module(rpo-functionattrs,globaldce)'

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
index c785e13403cc..da82b01560b3 100644
--- a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
+++ b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
@@ -77,4 +77,4 @@ attributes #1 = { nounwind readnone }
!6 = !{i32 2, !"Dwarf Version", i32 2}
!7 = !{i32 2, !"Debug Info Version", i32 3}
!8 = !DILocalVariable(name: "i", arg: 1, scope: !9, file: !3, line: 4, type: !5)
-!9 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !2, line: 4, type: !10, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2)!10 = !DISubroutineType(types: !11)!11 = !{!5, !5}!12 = !DIExpression()!13 = !DILocation(line: 5, scope: !14)!14 = distinct !DILexicalBlock(scope: !9, file: !3)!15 = !DILocation(line: 6, scope: !14)!16 = !DILocation(line: 7, scope: !14)!17 = !DILocation(line: 9, scope: !14)!18 = !DILocation(line: 11, scope: !14)!19 = !DILocation(line: 14, scope: !20)!20 = distinct !DILexicalBlock(scope: !21, file: !3)!21 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: !2, line: 13, type: !22, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2)!22 = !DISubroutineType(types: !23)!23 = !{!5}!24 = !DILocation(line: 15, scope: !20)!25 = !DILocation(line: 16, scope: !20)
\ No newline at end of file
+!9 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !2, file: !3, line: 4, type: !10, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2)!10 = !DISubroutineType(types: !11)!11 = !{!5, !5}!12 = !DIExpression()!13 = !DILocation(line: 5, scope: !14)!14 = distinct !DILexicalBlock(scope: !9, file: !3)!15 = !DILocation(line: 6, scope: !14)!16 = !DILocation(line: 7, scope: !14)!17 = !DILocation(line: 9, scope: !14)!18 = !DILocation(line: 11, scope: !14)!19 = !DILocation(line: 14, scope: !20)!20 = distinct !DILexicalBlock(scope: !21, file: !3)!21 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: !2, file: !3, line: 13, type: !22, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2)!22 = !DISubroutineType(types: !23)!23 = !{!5}!24 = !DILocation(line: 15, scope: !20)!25 = !DILocation(line: 16, scope: !20)
diff --git a/test/Transforms/GlobalOpt/externally-initialized-aggregate.ll b/test/Transforms/GlobalOpt/externally-initialized-aggregate.ll
index b446d24f1fd2..2434f20e92b2 100644
--- a/test/Transforms/GlobalOpt/externally-initialized-aggregate.ll
+++ b/test/Transforms/GlobalOpt/externally-initialized-aggregate.ll
@@ -5,11 +5,11 @@
; store to @a[0] from being constant propagated to the load in @foo, but will not
; prevent @a[1] from being removed since it is dead.
; CHECK: @a.0 = internal unnamed_addr externally_initialized global i32 undef
-; CHECK-NOT @a.1
+; CHECK-NOT: @a.1
@a = internal externally_initialized global [2 x i32] undef, align 4
; This is the same, but a struct rather than an array.
; CHECK: @b.0 = internal unnamed_addr externally_initialized global i32 undef
-; CHECK-NOT @b.1
+; CHECK-NOT: @b.1
@b = internal externally_initialized global {i32, i32} undef, align 4

define i32 @foo() {
diff --git a/test/Transforms/GlobalSplit/basic.ll b/test/Transforms/GlobalSplit/basic.ll
index a0aaeffb6c3f..6834a8d18be9 100644
--- a/test/Transforms/GlobalSplit/basic.ll
+++ b/test/Transforms/GlobalSplit/basic.ll
@@ -12,13 +12,13 @@ target triple = "x86_64-unknown-linux-gnu"
  ]

; CHECK-NOT: @global =
-; CHECK: @global.0 = private constant [2 x i8* ()*] [i8* ()* @f1, i8* ()* @f2], !type [[T1:![0-9]+$]]
-; CHECK: @global.1 = private constant [1 x i8* ()*] [i8* ()* @f3], !type [[T2:![0-9]+$]]
+; CHECK: @global.0 = private constant [2 x i8* ()*] [i8* ()* @f1, i8* ()* @f2], !type [[T1:![0-9]+]], !type [[T2:![0-9]+]], !type [[T3:![0-9]+$]]
+; CHECK: @global.1 = private constant [1 x i8* ()*] [i8* ()* @f3], !type [[T4:![0-9]+]], !type [[T5:![0-9]+$]]
; CHECK-NOT: @global =
@global = internal constant { [2 x i8* ()*], [1 x i8* ()*] } {
  [2 x i8* ()*] [i8* ()* @f1, i8* ()* @f2],
  [1 x i8* ()*] [i8* ()* @f3]
-}, !type !0, !type !1
+}, !type !0, !type !1, !type !2, !type !3, !type !4

; CHECK: define i8* @f1()
define i8* @f1() {
@@ -51,7 +51,13 @@ define void @foo() {

declare i1 @llvm.type.test(i8*, metadata) nounwind readnone

-; CHECK: [[T1]] = !{i32 8, !"foo"}
-; CHECK: [[T2]] = !{i32 0, !"bar"}
-!0 = !{i32 8, !"foo"}
-!1 = !{i32 16, !"bar"}
+; CHECK: [[T1]] = !{i32 0, !"foo"}
+; CHECK: [[T2]] = !{i32 15, !"bar"}
+; CHECK: [[T3]] = !{i32 16, !"a"}
+; CHECK: [[T4]] = !{i32 1, !"b"}
+; CHECK: [[T5]] = !{i32 8, !"c"}
+!0 = !{i32 0, !"foo"}
+!1 = !{i32 15, !"bar"}
+!2 = !{i32 16, !"a"}
+!3 = !{i32 17, !"b"}
+!4 = !{i32 24, !"c"}
diff --git a/test/Transforms/IPConstantProp/naked-return.ll b/test/Transforms/IPConstantProp/naked-return.ll
index f52417fcf7ea..3a2dedafcd37 100644
--- a/test/Transforms/IPConstantProp/naked-return.ll
+++ b/test/Transforms/IPConstantProp/naked-return.ll
@@ -1,4 +1,5 @@
; RUN: opt -ipsccp -S %s | FileCheck %s
+; RUN: opt -ipconstprop -S %s | FileCheck %s

target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i686-pc-windows-msvc19.0.24215"
@@ -24,5 +25,5 @@ bb:
  ret void
}

-attributes #0 = { naked noinline optnone }
+attributes #0 = { naked }
attributes #1 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
diff --git a/test/Transforms/IRCE/bad-loop-structure.ll b/test/Transforms/IRCE/bad-loop-structure.ll
new file mode 100644
index 000000000000..9c2e4251423d
--- /dev/null
+++ b/test/Transforms/IRCE/bad-loop-structure.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -irce -irce-print-changed-loops=true < %s | FileCheck %s
+
+; CHECK-NOT: irce
+
+define void @bad_loop_structure_increasing(i64 %iv.start) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %iv.start, %entry ], [ %indvars.iv.next, %for.inc ]
+  %cmp = icmp ult i64 %indvars.iv, 100
+  br i1 %cmp, label %switch.lookup, label %for.inc
+
+switch.lookup:
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp55 = icmp slt i64 %indvars.iv.next, 11
+  br i1 %cmp55, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+define void @bad_loop_structure_decreasing(i64 %iv.start) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %iv.start, %entry ], [ %indvars.iv.next, %for.inc ]
+  %cmp = icmp ult i64 %indvars.iv, 100
+  br i1 %cmp, label %switch.lookup, label %for.inc
+
+switch.lookup:
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, -1
+  %cmp55 = icmp sgt i64 %indvars.iv.next, 11
+  br i1 %cmp55, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll b/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll
index 402ae8cc05d0..b9d571d9b64f 100644
--- a/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll
+++ b/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -indvars -S "-default-data-layout=e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" | FileCheck %s
-; RUN: opt < %s -indvars -S "-default-data-layout=e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" | FileCheck %s
+; RUN: opt < %s -indvars -S "-data-layout=e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" | FileCheck %s
+; RUN: opt < %s -indvars -S "-data-layout=e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" | FileCheck %s
;
; PR11279: Assertion !IVLimit->getType()->isPointerTy()
;
diff --git a/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll b/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
index aa4fb8e68eb3..36c7bd9c5ec3 100644
--- a/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
+++ b/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
@@ -14,7 +14,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
; CHECK-LABEL: @indvar_32_bit(
; CHECK-NOT: sext i32
; CHECK: phi i32
-define void @indvar_32_bit(i32 %n, i32* nocapture %output) {
+define amdgpu_kernel void @indvar_32_bit(i32 %n, i32* nocapture %output) {
entry:
  %cmp5 = icmp sgt i32 %n, 0
  br i1 %cmp5, label %for.body.preheader, label %for.end
@@ -46,7 +46,7 @@ for.end: ; preds = %for.end.loopexit, %
; CHECK-NOT: ashr i64
; CHECK-NOT: mul nsw i64
; CHECK-NOT: add nsw i64
-define void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
  br label %for.body
@@ -72,7 +72,7 @@ for.end:
; be legalized anyway.

; CHECK-LABEL: @indvar_48_bit(
-define void @indvar_48_bit(i48 %n, i48* nocapture %output) {
+define amdgpu_kernel void @indvar_48_bit(i48 %n, i48* nocapture %output) {
entry:
  %cmp5 = icmp sgt i48 %n, 0
  br i1 %cmp5, label %for.body.preheader, label %for.end
diff --git a/test/Transforms/IndVarSimplify/exit_value_test2.ll b/test/Transforms/IndVarSimplify/exit_value_test2.ll
index 24e3e95a8918..ee641667506c 100644
--- a/test/Transforms/IndVarSimplify/exit_value_test2.ll
+++ b/test/Transforms/IndVarSimplify/exit_value_test2.ll
@@ -8,14 +8,14 @@
; CHECK-NOT: udiv

declare void @_Z3mixRjj(i32* dereferenceable(4), i32)
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)

define i32 @_Z3fooPKcjj(i8* nocapture readonly %s, i32 %len, i32 %c) {
entry:
  %a = alloca i32, align 4
  %tmp = bitcast i32* %a to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %tmp)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %tmp)
  store i32 -1640531527, i32* %a, align 4
  %cmp8 = icmp ugt i32 %len, 11
  br i1 %cmp8, label %while.body.lr.ph, label %while.end
@@ -47,6 +47,6 @@ while.end: ; preds = %while.cond.while.en
  %keylen.0.lcssa = phi i32 [ %sub.lcssa, %while.cond.while.end_crit_edge ], [ %len, %entry ]
  call void @_Z3mixRjj(i32* dereferenceable(4) %a, i32 %keylen.0.lcssa)
  %tmp4 = load i32, i32* %a, align 4
-  call void @llvm.lifetime.end(i64 4, i8* %tmp)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %tmp)
  ret i32 %tmp4
}
diff --git a/test/Transforms/IndVarSimplify/pr32045.ll b/test/Transforms/IndVarSimplify/pr32045.ll
new file mode 100644
index 000000000000..31efac3f833c
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/pr32045.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; This is not an IndVarSimplify bug, but the original symptom
+; manifested as one.
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i32* %sink) {
+; CHECK-LABEL: @foo(
+; CHECK: for.end:
+; CHECK-NEXT: [[SHR:%.*]] = ashr i32 %neg3, -1
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[SHR]]
+; CHECK-NEXT: [[SHR1:%.*]] = ashr i32 [[SUB]], [[B:%.*]]
+; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[SHR1]], -1
+; CHECK-NEXT: store i32 [[NEG]], i32* %sink
+;
+entry:
+  %tobool2 = icmp eq i32 %a, 0
+  br i1 %tobool2, label %exit, label %preheader
+
+preheader:
+  %neg3 = phi i32 [ %c, %entry ], [ %neg, %for.end ]
+  br label %for
+
+for:
+  %p = phi i32 [ %dec, %for ], [ 1, %preheader ]
+  %cmp = icmp sgt i32 %p, -1
+  %dec = add nsw i32 %p, -1
+  br i1 %cmp, label %for, label %for.end
+
+for.end:
+  %shr = ashr i32 %neg3, %p
+  %sub = sub nsw i32 0, %shr
+  %shr1 = ashr i32 %sub, %b
+  %neg = xor i32 %shr1, -1
+  store i32 %neg, i32* %sink
+  br i1 false, label %exit, label %preheader
+
+exit:
+  ret i32 0
+}
diff --git a/test/Transforms/IndVarSimplify/replace-sdiv-by-udiv.ll b/test/Transforms/IndVarSimplify/replace-sdiv-by-udiv.ll
new file mode 100644
index 000000000000..af25b20bec37
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/replace-sdiv-by-udiv.ll
@@ -0,0 +1,130 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+define void @test0(i32* %a) {
+; CHECK-LABEL: @test0(
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %div = sdiv i32 %i.01, 2
+; CHECK-NOT: sdiv
+; CHECK: udiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+  ret void
+}
+
+define void @test1(i32* %a) {
+; CHECK-LABEL: @test1(
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %div = sdiv exact i32 %i.01, 2
+; CHECK-NOT: sdiv
+; CHECK: udiv exact
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+  ret void
+}
+
+define void @test2(i32* %a, i32 %d) {
+; CHECK-LABEL: @test2(
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = mul nsw i32 %i.01, 64
+  %div = sdiv i32 %mul, %d
+; CHECK-NOT: udiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+  ret void
+}
+
+define void @test3(i32* %a) {
+; CHECK-LABEL: @test3(
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %div = sdiv i32 2048, %i.01
+; CHECK: udiv
+; CHECK-NOT: sdiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+  ret void
+}
+
+define void @test4(i32* %a) {
+; CHECK-LABEL: @test4(
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = mul nsw i32 %i.01, 64
+  %div = sdiv i32 %mul, 8
+; CHECK: udiv
+; CHECK-NOT: sdiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+  ret void
+}
+
+define void @test5(i32* %a) {
+; CHECK-LABEL: @test5(
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = mul nsw i32 %i.01, 64
+  %div = sdiv i32 %mul, 6
+; CHECK: udiv
+; CHECK-NOT: sdiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+  ret void
+}
+
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
new file mode 100644
index 000000000000..b566c147e9b8
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
@@ -0,0 +1,173 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+
+; Trivial optimization of generic addressing
+
+; CHECK-LABEL: @load_global_from_flat(
+; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
+; CHECK-NEXT: %tmp1 = load float, float addrspace(1)* %tmp0
+; CHECK-NEXT: ret float %tmp1
+define float @load_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
+  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
+  %tmp1 = load float, float addrspace(1)* %tmp0
+  ret float %tmp1
+}
+
+; CHECK-LABEL: @load_constant_from_flat(
+; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(2)*
+; CHECK-NEXT: %tmp1 = load float, float addrspace(2)* %tmp0
+; CHECK-NEXT: ret float %tmp1
+define float @load_constant_from_flat(float addrspace(4)* %generic_scalar) #0 {
+  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(2)*
+  %tmp1 = load float, float addrspace(2)* %tmp0
+  ret float %tmp1
+}
+
+; CHECK-LABEL: @load_group_from_flat(
+; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
+; CHECK-NEXT: %tmp1 = load float, float addrspace(3)* %tmp0
+; CHECK-NEXT: ret float %tmp1
+define float @load_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
+  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
+  %tmp1 = load float, float addrspace(3)* %tmp0
+  ret float %tmp1
+}
+
+; CHECK-LABEL: @load_private_from_flat(
+; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
+; CHECK-NEXT: %tmp1 = load float, float* %tmp0
+; CHECK-NEXT: ret float %tmp1
+define float @load_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
+  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
+  %tmp1 = load float, float* %tmp0
+  ret float %tmp1
+}
+
+; CHECK-LABEL: @store_global_from_flat(
+; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* %tmp0
+define amdgpu_kernel void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
+  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
+  store float 0.0, float addrspace(1)* %tmp0
+  ret void
+}
+
+; CHECK-LABEL: @store_group_from_flat(
+; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(3)* %tmp0
+define amdgpu_kernel void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
+  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
+  store float 0.0, float addrspace(3)* %tmp0
+  ret void
+}
+
+; CHECK-LABEL: @store_private_from_flat(
+; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
+; CHECK-NEXT: store float 0.000000e+00, float* %tmp0
+define amdgpu_kernel void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
+  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
+  store float 0.0, float* %tmp0
+  ret void
+}
+
+; Optimized to global load/store.
+; CHECK-LABEL: @load_store_global(
+; CHECK-NEXT: %val = load i32, i32 addrspace(1)* %input, align 4
+; CHECK-NEXT: store i32 %val, i32 addrspace(1)* %output, align 4
+; CHECK-NEXT: ret void
+define amdgpu_kernel void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+  %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
+  %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
+  %val = load i32, i32 addrspace(4)* %tmp0, align 4
+  store i32 %val, i32 addrspace(4)* %tmp1, align 4
+  ret void
+}
+
+; Optimized to group load/store.
+; CHECK-LABEL: @load_store_group(
+; CHECK-NEXT: %val = load i32, i32 addrspace(3)* %input, align 4
+; CHECK-NEXT: store i32 %val, i32 addrspace(3)* %output, align 4
+; CHECK-NEXT: ret void
+define amdgpu_kernel void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+  %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
+  %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
+  %val = load i32, i32 addrspace(4)* %tmp0, align 4
+  store i32 %val, i32 addrspace(4)* %tmp1, align 4
+  ret void
+}
+
+; Optimized to private load/store.
+; CHECK-LABEL: @load_store_private(
+; CHECK-NEXT: %val = load i32, i32* %input, align 4
+; CHECK-NEXT: store i32 %val, i32* %output, align 4
+; CHECK-NEXT: ret void
+define amdgpu_kernel void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 {
+  %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
+  %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
+  %val = load i32, i32 addrspace(4)* %tmp0, align 4
+  store i32 %val, i32 addrspace(4)* %tmp1, align 4
+  ret void
+}
+
+; No optimization. Flat load/store.
+; CHECK-LABEL: @load_store_flat(
+; CHECK-NEXT: %val = load i32, i32 addrspace(4)* %input, align 4
+; CHECK-NEXT: store i32 %val, i32 addrspace(4)* %output, align 4
+; CHECK-NEXT: ret void
+define amdgpu_kernel void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 {
+  %val = load i32, i32 addrspace(4)* %input, align 4
+  store i32 %val, i32 addrspace(4)* %output, align 4
+  ret void
+}
+
+; CHECK-LABEL: @store_addrspacecast_ptr_value(
+; CHECK: %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
+; CHECK-NEXT: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
+define amdgpu_kernel void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 {
+  %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
+  store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
+  ret void
+}
+
+; CHECK-LABEL: @atomicrmw_add_global_to_flat(
+; CHECK-NEXT: %ret = atomicrmw add i32 addrspace(1)* %global.ptr, i32 %y seq_cst
+define i32 @atomicrmw_add_global_to_flat(i32 addrspace(1)* %global.ptr, i32 %y) #0 {
+  %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)*
+  %ret = atomicrmw add i32 addrspace(4)* %cast, i32 %y seq_cst
+  ret i32 %ret
+}
+
+; CHECK-LABEL: @atomicrmw_add_group_to_flat(
+; CHECK-NEXT: %ret = atomicrmw add i32 addrspace(3)* %group.ptr, i32 %y seq_cst
+define i32 @atomicrmw_add_group_to_flat(i32 addrspace(3)* %group.ptr, i32 %y) #0 {
+  %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)*
+  %ret = atomicrmw add i32 addrspace(4)* %cast, i32 %y seq_cst
+  ret i32 %ret
+}
+
+; CHECK-LABEL: @cmpxchg_global_to_flat(
+; CHECK: %ret = cmpxchg i32 addrspace(1)* %global.ptr, i32 %cmp, i32 %val seq_cst monotonic
+define { i32, i1 } @cmpxchg_global_to_flat(i32 addrspace(1)* %global.ptr, i32 %cmp, i32 %val) #0 {
+  %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)*
+  %ret = cmpxchg i32 addrspace(4)* %cast, i32 %cmp, i32 %val seq_cst monotonic
+  ret { i32, i1 } %ret
+}
+
+; CHECK-LABEL: @cmpxchg_group_to_flat(
+; CHECK: %ret = cmpxchg i32 addrspace(3)* %group.ptr, i32 %cmp, i32 %val seq_cst monotonic
+define { i32, i1 } @cmpxchg_group_to_flat(i32 addrspace(3)* %group.ptr, i32 %cmp, i32 %val) #0 {
+  %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)*
+  %ret = cmpxchg i32 addrspace(4)* %cast, i32 %cmp, i32 %val seq_cst monotonic
+  ret { i32, i1 } %ret
+}
+
+; Not pointer operand
+; CHECK-LABEL: @cmpxchg_group_to_flat_wrong_operand(
+; CHECK: %cast.cmp = addrspacecast i32 addrspace(3)* %cmp.ptr to i32 addrspace(4)*
+; CHECK: %ret = cmpxchg i32 addrspace(4)* addrspace(3)* %cas.ptr, i32 addrspace(4)* %cast.cmp, i32 addrspace(4)* %val seq_cst monotonic
+define { i32 addrspace(4)*, i1 } @cmpxchg_group_to_flat_wrong_operand(i32 addrspace(4)* addrspace(3)* %cas.ptr, i32 addrspace(3)* %cmp.ptr, i32 addrspace(4)* %val) #0 {
+  %cast.cmp = addrspacecast i32 addrspace(3)* %cmp.ptr to i32 addrspace(4)*
+  %ret = cmpxchg i32 addrspace(4)* addrspace(3)* %cas.ptr, i32 addrspace(4)* %cast.cmp, i32 addrspace(4)* %val seq_cst monotonic
+  ret { i32 addrspace(4)*, i1 } %ret
+}
+
+attributes #0 = { nounwind }
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/icmp.ll b/test/Transforms/InferAddressSpaces/AMDGPU/icmp.ll
new file mode 100644
index 000000000000..b185ede26579
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/icmp.ll
@@ -0,0 +1,160 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+
+; CHECK-LABEL: @icmp_flat_cmp_self(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, %group.ptr.0
define i1 @icmp_flat_cmp_self(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, %cast0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_flat_flat_from_group(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, %group.ptr.1
+define i1 @icmp_flat_flat_from_group(i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, %cast1
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_flat_from_group_private(
+; CHECK: %1 = addrspacecast i32* %private.ptr.0 to i32 addrspace(4)*
+; CHECK: %2 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, %2
+define i1 @icmp_mismatch_flat_from_group_private(i32* %private.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+  %cast0 = addrspacecast i32* %private.ptr.0 to i32 addrspace(4)*
+  %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, %cast1
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_flat_group_flat(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, %flat.ptr.1
+define i1 @icmp_flat_group_flat(i32 addrspace(3)* %group.ptr.0, i32 addrspace(4)* %flat.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, %flat.ptr.1
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_flat_flat_group(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %flat.ptr.0, %1
+define i1 @icmp_flat_flat_group(i32 addrspace(4)* %flat.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+  %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %flat.ptr.0, %cast1
+  ret i1 %cmp
+}
+
+; Keeping the cmp as addrspace(3)* is better
+; CHECK-LABEL: @icmp_flat_to_group_cmp(
+; CHECK: %cast0 = addrspacecast i32 addrspace(4)* %flat.ptr.0 to i32 addrspace(3)*
+; CHECK: %cast1 = addrspacecast i32 addrspace(4)* %flat.ptr.1 to i32 addrspace(3)*
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %cast0, %cast1
+define i1 @icmp_flat_to_group_cmp(i32 addrspace(4)* %flat.ptr.0, i32 addrspace(4)* %flat.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(4)* %flat.ptr.0 to i32 addrspace(3)*
+  %cast1 = addrspacecast i32 addrspace(4)* %flat.ptr.1 to i32 addrspace(3)*
+  %cmp = icmp eq i32 addrspace(3)* %cast0, %cast1
+  ret i1 %cmp
+}
+
+; FIXME: Should be able to ask the target how to constant fold the constant
+; cast, if changing this is OK when 0 is a valid pointer.
+
+; CHECK-LABEL: @icmp_group_flat_cmp_null(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
+define i1 @icmp_group_flat_cmp_null(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, null
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_group_flat_cmp_constant_inttoptr(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, addrspacecast (i32 addrspace(4)* inttoptr (i64 400 to i32 addrspace(4)*) to i32 addrspace(3)*)
+define i1 @icmp_group_flat_cmp_constant_inttoptr(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, inttoptr (i64 400 to i32 addrspace(4)*)
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_null(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, addrspacecast (i32* null to i32 addrspace(4)*)
+define i1 @icmp_mismatch_flat_group_private_cmp_null(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, addrspacecast (i32* null to i32 addrspace(4)*)
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_undef(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, undef
+define i1 @icmp_mismatch_flat_group_private_cmp_undef(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, addrspacecast (i32* undef to i32 addrspace(4)*)
+  ret i1 %cmp
+}
+
+@lds0 = internal addrspace(3) global i32 0, align 4
+@global0 = internal addrspace(1) global i32 0, align 4
+
+; CHECK-LABEL: @icmp_mismatch_flat_group_global_cmp_gv(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
+define i1 @icmp_mismatch_flat_group_global_cmp_gv(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_group_global_cmp_gv_gv(
+; CHECK: %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
+define i1 @icmp_mismatch_group_global_cmp_gv_gv(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_group_flat_cmp_undef(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, undef
+define i1 @icmp_group_flat_cmp_undef(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, undef
+  ret i1 %cmp
+}
+
+; Test non-canonical orders
+; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_null_swap(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*), %1
+define i1 @icmp_mismatch_flat_group_private_cmp_null_swap(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*), %cast0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_group_flat_cmp_undef_swap(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* undef, %group.ptr.0
+define i1 @icmp_group_flat_cmp_undef_swap(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* undef, %cast0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_undef_swap(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* undef, %group.ptr.0
+define i1 @icmp_mismatch_flat_group_private_cmp_undef_swap(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32* undef to i32 addrspace(4)*), %cast0
+  ret i1 %cmp
+}
+
+; TODO: Should be handled
+; CHECK-LABEL: @icmp_flat_flat_from_group_vector(
+; CHECK: %cmp = icmp eq <2 x i32 addrspace(4)*> %cast0, %cast1
+define <2 x i1> @icmp_flat_flat_from_group_vector(<2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 {
+  %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*>
+  %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*>
+  %cmp = icmp eq <2 x i32 addrspace(4)*> %cast0, %cast1
+  ret <2 x i1> %cmp
+}
+
+attributes #0 = { nounwind }
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll b/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
new file mode 100644
index 000000000000..52067cd37bb9
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
@@ -0,0 +1,175 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+; Ports of most of test/CodeGen/NVPTX/access-non-generic.ll
+
+@scalar = internal addrspace(3) global float 0.0, align 4
+@array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
+
+; CHECK-LABEL: @load_store_lds_f32(
+; CHECK: %tmp = load float, float addrspace(3)* @scalar, align 4
+; CHECK: call void @use(float %tmp)
+; CHECK: store float %v, float addrspace(3)* @scalar, align 4
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: %tmp2 = load float, float addrspace(3)* @scalar, align 4
+; CHECK: call void @use(float %tmp2)
+; CHECK: store float %v, float addrspace(3)* @scalar, align 4
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: %tmp3 = load float, float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i32 0, i32 5), align 4
+; CHECK: call void @use(float %tmp3)
+; CHECK: store float %v, float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i32 0, i32 5), align 4
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: %tmp4 = getelementptr inbounds [10 x float], [10 x float] addrspace(3)* @array, i32 0, i32 5
+; CHECK: %tmp5 = load float, float addrspace(3)* %tmp4, align 4
+; CHECK: call void @use(float %tmp5)
+; CHECK: store float %v, float addrspace(3)* %tmp4, align 4
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: %tmp7 = getelementptr inbounds [10 x float], [10 x float] addrspace(3)* @array, i32 0, i32 %i
+; CHECK: %tmp8 = load float, float addrspace(3)* %tmp7, align 4
+; CHECK: call void @use(float %tmp8)
+; CHECK: store float %v, float addrspace(3)* %tmp7, align 4
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: ret void
+define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 {
+bb:
+  %tmp = load float, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4
+  call void @use(float %tmp)
+  store float %v, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4
+  call void @llvm.amdgcn.s.barrier()
+  %tmp1 = addrspacecast float addrspace(3)* @scalar to float addrspace(4)*
+  %tmp2 = load float, float addrspace(4)* %tmp1, align 4
+  call void @use(float %tmp2)
+  store float %v, float addrspace(4)* %tmp1, align 4
+  call void @llvm.amdgcn.s.barrier()
+  %tmp3 = load float, float addrspace(4)* getelementptr inbounds ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i32 0, i32 5), align 4
+  call void @use(float %tmp3)
+  store float %v, float addrspace(4)* getelementptr inbounds ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i32 0, i32 5), align 4
+  call void @llvm.amdgcn.s.barrier()
+  %tmp4 = getelementptr inbounds [10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i32 0, i32 5
+  %tmp5 = load float, float addrspace(4)* %tmp4, align 4
+  call void @use(float %tmp5)
+  store float %v, float addrspace(4)* %tmp4, align 4
+  call void @llvm.amdgcn.s.barrier()
+  %tmp6 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*
+  %tmp7 = getelementptr inbounds [10 x float], [10 x float] addrspace(4)* %tmp6, i32 0, i32 %i
+  %tmp8 = load float, float addrspace(4)* %tmp7, align 4
+  call void @use(float %tmp8)
+  store float %v, float addrspace(4)* %tmp7, align 4
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}
+
+; CHECK-LABEL: @constexpr_load_int_from_float_lds(
+; CHECK: %tmp = load i32, i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*), align 4
+define i32 @constexpr_load_int_from_float_lds() #0 {
+bb:
+  %tmp = load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*) to i32 addrspace(4)*), align 4
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: @load_int_from_global_float(
+; CHECK: %tmp1 = getelementptr float, float addrspace(1)* %input, i32 %i
+; CHECK: %tmp2 = getelementptr float, float addrspace(1)* %tmp1, i32 %j
+; CHECK: %tmp3 = bitcast float addrspace(1)* %tmp2 to i32 addrspace(1)*
+; CHECK: %tmp4 = load i32, i32 addrspace(1)* %tmp3
+; CHECK: ret i32 %tmp4
+define i32 @load_int_from_global_float(float addrspace(1)* %input, i32 %i, i32 %j) #0 {
+bb:
+  %tmp = addrspacecast float addrspace(1)* %input to float addrspace(4)*
+  %tmp1 = getelementptr float, float addrspace(4)* %tmp, i32 %i
+  %tmp2 = getelementptr float, float addrspace(4)* %tmp1, i32 %j
+  %tmp3 = bitcast float addrspace(4)* %tmp2 to i32 addrspace(4)*
+  %tmp4 = load i32, i32 addrspace(4)* %tmp3
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: @nested_const_expr(
+; CHECK: store i32 1, i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i64 0, i64 1) to i32 addrspace(3)*), align 4
+define amdgpu_kernel void @nested_const_expr() #0 {
+  store i32 1, i32 addrspace(4)* bitcast (float addrspace(4)* getelementptr ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]
addrspace(4)*), i64 0, i64 1) to i32 addrspace(4)*), align 4 + ret void +} + +; CHECK-LABEL: @rauw( +; CHECK: %addr = getelementptr float, float addrspace(1)* %input, i64 10 +; CHECK-NEXT: %v = load float, float addrspace(1)* %addr +; CHECK-NEXT: store float %v, float addrspace(1)* %addr +; CHECK-NEXT: ret void +define amdgpu_kernel void @rauw(float addrspace(1)* %input) #0 { +bb: + %generic_input = addrspacecast float addrspace(1)* %input to float addrspace(4)* + %addr = getelementptr float, float addrspace(4)* %generic_input, i64 10 + %v = load float, float addrspace(4)* %addr + store float %v, float addrspace(4)* %addr + ret void +} + +; FIXME: Should be able to eliminate the cast inside the loop +; CHECK-LABEL: @loop( + +; CHECK: %p = bitcast [10 x float] addrspace(3)* @array to float addrspace(3)* +; CHECK: %end = getelementptr float, float addrspace(3)* %p, i64 10 +; CHECK: br label %loop + +; CHECK: loop: ; preds = %loop, %entry +; CHECK: %i = phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ] +; CHECK: %v = load float, float addrspace(3)* %i +; CHECK: call void @use(float %v) +; CHECK: %i2 = getelementptr float, float addrspace(3)* %i, i64 1 +; CHECK: %exit_cond = icmp eq float addrspace(3)* %i2, %end + +; CHECK: br i1 %exit_cond, label %exit, label %loop +define amdgpu_kernel void @loop() #0 { +entry: + %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)* + %end = getelementptr float, float addrspace(4)* %p, i64 10 + br label %loop + +loop: ; preds = %loop, %entry + %i = phi float addrspace(4)* [ %p, %entry ], [ %i2, %loop ] + %v = load float, float addrspace(4)* %i + call void @use(float %v) + %i2 = getelementptr float, float addrspace(4)* %i, i64 1 + %exit_cond = icmp eq float addrspace(4)* %i2, %end + br i1 %exit_cond, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +@generic_end = external addrspace(1) global float addrspace(4)* + +; CHECK-LABEL: @loop_with_generic_bound( +; CHECK: %p = bitcast [10 x float] addrspace(3)* @array to float addrspace(3)* +; CHECK: %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end +; CHECK: br label %loop + +; CHECK: loop: +; CHECK: %i = phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ] +; CHECK: %v = load float, float addrspace(3)* %i +; CHECK: call void @use(float %v) +; CHECK: %i2 = getelementptr float, float addrspace(3)* %i, i64 1 +; CHECK: %0 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)* +; CHECK: %exit_cond = icmp eq float addrspace(4)* %0, %end +; CHECK: br i1 %exit_cond, label %exit, label %loop +define amdgpu_kernel void @loop_with_generic_bound() #0 { +entry: + %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)* + %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end + br label %loop + +loop: ; preds = %loop, %entry + %i = phi float addrspace(4)* [ %p, %entry ], [ %i2, %loop ] + %v = load float, float addrspace(4)* %i + call void @use(float %v) + %i2 = getelementptr float, float addrspace(4)* %i, i64 1 + %exit_cond = icmp eq float addrspace(4)* %i2, %end + br i1 %exit_cond, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @use(float) #0 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll b/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll new file mode 100644 index 000000000000..ca6138d3fb01 --- /dev/null +++ 
b/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll @@ -0,0 +1,146 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s + +; CHECK-LABEL: @objectsize_group_to_flat_i32( +; CHECK: %val = call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %group.ptr, i1 true, i1 false) +define i32 @objectsize_group_to_flat_i32(i8 addrspace(3)* %group.ptr) #0 { + %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* + %val = call i32 @llvm.objectsize.i32.p4i8(i8 addrspace(4)* %cast, i1 true, i1 false) + ret i32 %val +} + +; CHECK-LABEL: @objectsize_global_to_flat_i64( +; CHECK: %val = call i64 @llvm.objectsize.i64.p3i8(i8 addrspace(3)* %global.ptr, i1 true, i1 false) +define i64 @objectsize_global_to_flat_i64(i8 addrspace(3)* %global.ptr) #0 { + %cast = addrspacecast i8 addrspace(3)* %global.ptr to i8 addrspace(4)* + %val = call i64 @llvm.objectsize.i64.p4i8(i8 addrspace(4)* %cast, i1 true, i1 false) + ret i64 %val +} + +; CHECK-LABEL: @atomicinc_global_to_flat_i32( +; CHECK: call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %global.ptr, i32 %y, i32 0, i32 0, i1 false) +define i32 @atomicinc_global_to_flat_i32(i32 addrspace(1)* %global.ptr, i32 %y) #0 { + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* + %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %cast, i32 %y, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK-LABEL: @atomicinc_group_to_flat_i32( +; CHECK: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %group.ptr, i32 %y, i32 0, i32 0, i1 false) +define i32 @atomicinc_group_to_flat_i32(i32 addrspace(3)* %group.ptr, i32 %y) #0 { + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* + %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %cast, i32 %y, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK-LABEL: @atomicinc_global_to_flat_i64( +; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %global.ptr, i64 %y, i32 0, i32 0, i1 false) +define i64 @atomicinc_global_to_flat_i64(i64 addrspace(1)* %global.ptr, i64 %y) #0 { + %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)* + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false) + ret i64 %ret +} + +; CHECK-LABEL: @atomicinc_group_to_flat_i64( +; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %group.ptr, i64 %y, i32 0, i32 0, i1 false) +define i64 @atomicinc_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 { + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false) + ret i64 %ret +} + +; CHECK-LABEL: @atomicdec_global_to_flat_i32( +; CHECK: call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %global.ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @atomicdec_global_to_flat_i32(i32 addrspace(1)* %global.ptr, i32 %val) #0 { + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK-LABEL: @atomicdec_group_to_flat_i32( +; CHECK: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %group.ptr, i32 %val, i32 0, i32 0, i1 false) +define i32 @atomicdec_group_to_flat_i32(i32 addrspace(3)* %group.ptr, i32 %val) #0 { + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 
addrspace(4)* + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 false) + ret i32 %ret +} + +; CHECK-LABEL: @atomicdec_global_to_flat_i64( +; CHECK: call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %global.ptr, i64 %y, i32 0, i32 0, i1 false) +define i64 @atomicdec_global_to_flat_i64(i64 addrspace(1)* %global.ptr, i64 %y) #0 { + %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)* + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false) + ret i64 %ret +} + +; CHECK-LABEL: @atomicdec_group_to_flat_i64( +; CHECK: call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %group.ptr, i64 %y, i32 0, i32 0, i1 false) +define i64 @atomicdec_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 { + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false) + ret i64 %ret +} + +; CHECK-LABEL: @volatile_atomicinc_group_to_flat_i64( +; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* +; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 true) +define i64 @volatile_atomicinc_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 { + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 true) + ret i64 %ret +} + +; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i32( +; CHECK-NEXT: %1 = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* +; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %1, i32 %val, i32 0, i32 0, i1 true) +define i32 @volatile_atomicdec_global_to_flat_i32(i32 addrspace(1)* %global.ptr, i32 %val) #0 { + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 true) + ret i32 %ret +} + +; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i32( +; CHECK-NEXT: %1 = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* +; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %1, i32 %val, i32 0, i32 0, i1 true) +define i32 @volatile_atomicdec_group_to_flat_i32(i32 addrspace(3)* %group.ptr, i32 %val) #0 { + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* + %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 true) + ret i32 %ret +} + +; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i64( +; CHECK-NEXT: %1 = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)* +; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 true) +define i64 @volatile_atomicdec_global_to_flat_i64(i64 addrspace(1)* %global.ptr, i64 %y) #0 { + %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)* + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 true) + ret i64 %ret +} + +; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i64( +; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* +; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 true) +define i64
@volatile_atomicdec_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 { + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* + %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 true) + ret i64 %ret +} + +; CHECK-LABEL: @invalid_variable_volatile_atomicinc_group_to_flat_i64( +; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* +; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 %volatile.var) +define i64 @invalid_variable_volatile_atomicinc_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y, i1 %volatile.var) #0 { + %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)* + %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 %volatile.var) + ret i64 %ret +} + +declare i32 @llvm.objectsize.i32.p4i8(i8 addrspace(4)*, i1, i1) #1 +declare i64 @llvm.objectsize.i64.p4i8(i8 addrspace(4)*, i1, i1) #1 +declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind argmemonly } diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/lit.local.cfg b/test/Transforms/InferAddressSpaces/AMDGPU/lit.local.cfg new file mode 100644 index 000000000000..6baccf05fff0 --- /dev/null +++ b/test/Transforms/InferAddressSpaces/AMDGPU/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll new file mode 100644 index 000000000000..557a80f1a5d1 --- /dev/null +++ b/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -0,0 +1,134 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s + +; CHECK-LABEL: @memset_group_to_flat( +; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { + %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* + call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} + +; CHECK-LABEL: @memset_global_to_flat( +; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { + %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* + call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} + +; CHECK-LABEL: @memset_group_to_flat_no_md( +; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 %size, i32 4, i1 false){{$}} +define amdgpu_kernel void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 { + %cast = addrspacecast i8 addrspace(3)* %group.ptr to 
i8 addrspace(4)* + call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false) + ret void +} + +; CHECK-LABEL: @memset_global_to_flat_no_md( +; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 %size, i32 4, i1 false){{$}} +define amdgpu_kernel void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 { + %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* + call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false) + ret void +} + +; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group( +; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* + call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} + +; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group( +; CHECK: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 { + %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)* + call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} + +; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group( +; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %src.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* + %cast.dest = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* + call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} + +; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global( +; CHECK: call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8 addrspace(4)* + %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)* + call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} + +; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global( +; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false),
!tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 { + %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8 addrspace(4)* + call void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} + +; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct( +; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa.struct !7 +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* + call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa.struct !7 + ret void +} + +; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md( +; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* + call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) + ret void +} + +; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md( +; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} +; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} +define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* + call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) + call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) + ret void +} + +; Check for iterator problems if the pointer has 2 uses in the same call +; CHECK-LABEL: @memcpy_group_flat_to_flat_self( +; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 addrspace(3)* %group.ptr, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 { + %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* + call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast, i8 addrspace(4)* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} +; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group( +; CHECK: call void @llvm.memmove.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { + %cast.src = addrspacecast i8 
addrspace(3)* %src.group.ptr to i8 addrspace(4)* + call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} + +declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1 +declare void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1 +declare void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1 +declare void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"A", !2} +!2 = !{!"tbaa root"} +!3 = !{!"B", !2} +!4 = !{!5} +!5 = distinct !{!5, !6, !"some scope"} +!6 = distinct !{!6, !"some domain"} +!7 = !{i64 0, i64 8, null}
\ No newline at end of file diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll b/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll new file mode 100644 index 000000000000..3231b6ccf1cc --- /dev/null +++ b/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll @@ -0,0 +1,143 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s + +; Regression tests from old HSAIL addrspacecast optimization pass + +@data = internal addrspace(1) global [100 x double] [double 0.00, double 1.000000e-01, double 2.000000e-01, double 3.000000e-01, double 4.000000e-01, double 5.000000e-01, double 6.000000e-01, double 7.000000e-01, double 8.000000e-01, double 9.000000e-01, double 1.00, double 1.10, double 1.20, double 1.30, double 1.40, double 1.50, double 1.60, double 1.70, double 1.80, double 1.90, double 2.00, double 2.10, double 2.20, double 2.30, double 2.40, double 2.50, double 2.60, double 2.70, double 2.80, double 2.90, double 3.00, double 3.10, double 3.20, double 3.30, double 3.40, double 3.50, double 3.60, double 3.70, double 3.80, double 3.90, double 4.00, double 4.10, double 4.20, double 4.30, double 4.40, double 4.50, double 4.60, double 4.70, double 4.80, double 4.90, double 5.00, double 5.10, double 5.20, double 5.30, double 5.40, double 5.50, double 5.60, double 5.70, double 5.80, double 5.90, double 6.00, double 6.10, double 6.20, double 6.30, double 6.40, double 6.50, double 6.60, double 6.70, double 6.80, double 6.90, double 7.00, double 7.10, double 7.20, double 7.30, double 7.40, double 7.50, double 7.60, double 7.70, double 7.80, double 7.90, double 8.00, double 8.10, double 8.20, double 8.30, double 8.40, double 8.50, double 8.60, double 8.70, double 8.80, double 8.90, double 9.00, double 9.10, double 9.20, double 9.30, double 9.40, double 9.50, double 9.60, double 9.70, double 9.80, double 9.90], align 8 + + +; Should generate flat load + +; CHECK-LABEL: @generic_address_bitcast_const( +; CHECK: %vecload1 = load <2 x double>, <2 x double> addrspace(1)* bitcast (double addrspace(1)* getelementptr inbounds ([100 x double], [100 x double] addrspace(1)* @data, i64 0, i64 4) to <2 x double> addrspace(1)*), align 8 +define amdgpu_kernel void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 { +entry: + %tmp1 = call i32 @llvm.amdgcn.workitem.id.x() + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 %tmp2, %arg0 + %vecload1 = load <2 x double>, <2 x double> addrspace(4)* bitcast (double addrspace(4)* getelementptr ([100 x double], [100 x double] addrspace(4)* addrspacecast ([100 x double] addrspace(1)* @data to [100 x double] addrspace(4)*), i64 0, i64 4) to <2 x double> addrspace(4)*), align 8 + %cmp = fcmp ord <2 x double> %vecload1, zeroinitializer + %sext = sext <2 x i1> %cmp to <2 x i64> + %tmp4 = extractelement <2 x i64> %sext, i64 0 + %tmp5 = extractelement <2 x i64> %sext, i64 1 + %tmp6 = and i64 %tmp4, %tmp5 + %tmp7 = lshr i64 %tmp6, 63 + %tmp8 = trunc i64 %tmp7 to i32 + %idxprom = and i64 %tmp3, 4294967295 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %results, i64 %idxprom + store i32 %tmp8, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +@generic_address_bug9749.val = internal addrspace(1) global float 0.0, align 4 + +declare i32 @_Z9get_fencePU3AS4v(i8 addrspace(4)*) +%opencl.pipe_t = type opaque + +; This is a compile time assert bug, but we still want to check optimization +; is performed to generate ld_global.
+; CHECK-LABEL: @generic_address_pipe_bug9673( +; CHECK: %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)* +; CHECK: %add.ptr = getelementptr inbounds i32, i32 addrspace(3)* %tmp1, i32 2 +; CHECK: %tmp2 = load i32, i32 addrspace(3)* %add.ptr, align 4 +define amdgpu_kernel void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 { +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)* + %add.ptr = getelementptr inbounds i32, i32 addrspace(3)* %tmp1, i32 2 + %tmp2 = load i32, i32 addrspace(3)* %add.ptr, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %dst, i32 %tmp + store i32 %tmp2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Should generate flat load +; CHECK-LABEL: @generic_address_bug9749( +; CHECK: br i1 +; CHECK: load float, float addrspace(4)* +; CHECK: br label +define amdgpu_kernel void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 { +entry: + %ptr = alloca float addrspace(4)*, align 8 + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + store float 0x3FB99999A0000000, float addrspace(1)* @generic_address_bug9749.val, align 4 + store volatile float addrspace(4)* addrspacecast (float addrspace(1)* @generic_address_bug9749.val to float addrspace(4)*), float addrspace(4)** %ptr, align 8 + %tmp2 = load volatile float addrspace(4)*, float addrspace(4)** %ptr, align 8 + %tmp3 = load float, float addrspace(1)* @generic_address_bug9749.val, align 4 + %tmp4 = bitcast float addrspace(4)* %tmp2 to i8 addrspace(4)* + %call.i = call i32 @_Z9get_fencePU3AS4v(i8 addrspace(4)* %tmp4) #1 + %switch.i.i = icmp ult i32 %call.i, 4 + br i1 %switch.i.i, label %if.end.i, label %helperFunction.exit + +if.end.i: ; preds = %entry + %tmp5 = load float, float addrspace(4)* %tmp2, align 4 + %not.cmp.i = fcmp oeq float %tmp5, %tmp3 + %phitmp = zext i1 %not.cmp.i to i32 + br label %helperFunction.exit + +helperFunction.exit: ; preds = %if.end.i, %entry + %retval.0.i = phi i32 [ 0, %entry ], [ %phitmp, %if.end.i ] + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %results, i64 %tmp1 + store i32 %retval.0.i, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; CHECK-LABEL: @generic_address_opt_phi_bug9776_simple_phi_kernel( +; CHECK: phi i32 addrspace(3)* +; CHECK: store i32 %i.03, i32 addrspace(3)* % +define amdgpu_kernel void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 { +entry: + %cmp1 = icmp eq i32 %numElems, 0 + br i1 %cmp1, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %tmp = addrspacecast i32 addrspace(3)* %in to i32 addrspace(4)* + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %ptr.02 = phi i32 addrspace(4)* [ %tmp, %for.body.lr.ph ], [ %add.ptr, %for.body ] + store i32 %i.03, i32 addrspace(4)* %ptr.02, align 4 + %add.ptr = getelementptr inbounds i32, i32 addrspace(4)* %ptr.02, i64 4 + %inc = add nuw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc, %numElems + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @generic_address_bug9899( +; CHECK: %vecload = load <2 x i32>, <2 x i32> addrspace(3)* +; CHECK: store <2 x i32> %tmp16, <2 x i32> addrspace(3)* +define amdgpu_kernel void @generic_address_bug9899(i64 %arg0, i32 
addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 { +entry: + %tmp1 = call i32 @llvm.amdgcn.workitem.id.x() + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 %tmp2, %arg0 + %sext = shl i64 %tmp3, 32 + %tmp4 = addrspacecast i32 addrspace(3)* %destValues to i32 addrspace(4)* + %tmp5 = addrspacecast i32 addrspace(3)* %sourceA to i32 addrspace(4)* + %tmp6 = ashr exact i64 %sext, 31 + %tmp7 = getelementptr inbounds i32, i32 addrspace(4)* %tmp5, i64 %tmp6 + %arrayidx_v4 = bitcast i32 addrspace(4)* %tmp7 to <2 x i32> addrspace(4)* + %vecload = load <2 x i32>, <2 x i32> addrspace(4)* %arrayidx_v4, align 4 + %tmp8 = extractelement <2 x i32> %vecload, i32 0 + %tmp9 = extractelement <2 x i32> %vecload, i32 1 + %tmp10 = icmp eq i32 %tmp8, 0 + %tmp11 = select i1 %tmp10, i32 32, i32 %tmp8 + %tmp12 = icmp eq i32 %tmp9, 0 + %tmp13 = select i1 %tmp12, i32 32, i32 %tmp9 + %tmp14 = getelementptr inbounds i32, i32 addrspace(4)* %tmp4, i64 %tmp6 + %tmp15 = insertelement <2 x i32> undef, i32 %tmp11, i32 0 + %tmp16 = insertelement <2 x i32> %tmp15, i32 %tmp13, i32 1 + %arrayidx_v41 = bitcast i32 addrspace(4)* %tmp14 to <2 x i32> addrspace(4)* + store <2 x i32> %tmp16, <2 x i32> addrspace(4)* %arrayidx_v41, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone }
\ No newline at end of file diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/select.ll b/test/Transforms/InferAddressSpaces/AMDGPU/select.ll new file mode 100644 index 000000000000..08edc20ecf9b --- /dev/null +++ b/test/Transforms/InferAddressSpaces/AMDGPU/select.ll @@ -0,0 +1,264 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s + +; Instcombine pulls the addrspacecast out of the select, make sure +; this doesn't do something insane on non-canonical IR. + +; CHECK-LABEL: @return_select_group_flat( +; CHECK-NEXT: %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* +; CHECK-NEXT: %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* +; CHECK-NEXT: %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 +; CHECK-NEXT: ret i32 addrspace(4)* %select +define i32 addrspace(4)* @return_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 + ret i32 addrspace(4)* %select +} + +; CHECK-LABEL: @store_select_group_flat( +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1 +; CHECK: store i32 -1, i32 addrspace(3)* %select +define amdgpu_kernel void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 + store i32 -1, i32 addrspace(4)* %select + ret void +} + +; Make sure metadata is preserved +; CHECK-LABEL: @load_select_group_flat_md( +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1, !prof !0 +; CHECK: %load = load i32, i32 addrspace(3)* %select +define i32 @load_select_group_flat_md(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1, !prof !0 + %load = load i32, i32 addrspace(4)* %select + ret i32 %load +} + +; CHECK-LABEL: @store_select_mismatch_group_private_flat( +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* +; CHECK: %2 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)* +; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* %2 +; CHECK: store i32 -1, i32 addrspace(4)* %select +define amdgpu_kernel void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %cast1 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 + store i32 -1, i32 addrspace(4)* %select + ret void +} + +@lds0 = internal addrspace(3) global i32 123, align 4 +@lds1 = internal addrspace(3) global i32 456, align 4 + +; CHECK-LABEL: @constexpr_select_group_flat( +; CHECK: %tmp = load i32, i32 addrspace(3)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), 
i32 addrspace(3)* @lds0, i32 addrspace(3)* @lds1) +define i32 @constexpr_select_group_flat() #0 { +bb: + %tmp = load i32, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*)) + ret i32 %tmp +} + +; CHECK-LABEL: @constexpr_select_group_global_flat_mismatch( +; CHECK: %tmp = load i32, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)) +define i32 @constexpr_select_group_global_flat_mismatch() #0 { +bb: + %tmp = load i32, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)) + ret i32 %tmp +} + +; CHECK-LABEL: @store_select_group_flat_null( +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) +; CHECK: store i32 -1, i32 addrspace(3)* %select +define amdgpu_kernel void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null + store i32 -1, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_flat_null_swap( +; CHECK: %select = select i1 %c, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)* %group.ptr.0 +; CHECK: store i32 -1, i32 addrspace(3)* %select +define amdgpu_kernel void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* null, i32 addrspace(4)* %cast0 + store i32 -1, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_flat_undef( +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* undef +; CHECK: store i32 -1, i32 addrspace(3)* %select +define amdgpu_kernel void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* undef + store i32 -1, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_flat_undef_swap( +; CHECK: %select = select i1 %c, i32 addrspace(3)* undef, i32 addrspace(3)* %group.ptr.0 +; CHECK: store i32 -1, i32 addrspace(3)* %select +define amdgpu_kernel void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* undef, i32 addrspace(4)* %cast0 + store i32 -1, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_gep_group_flat_null( +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) +; CHECK: %gep = getelementptr i32, i32 addrspace(3)* %select, i64 16 +; CHECK: store i32 -1, i32 addrspace(3)* %gep +define 
amdgpu_kernel void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null + %gep = getelementptr i32, i32 addrspace(4)* %select, i64 16 + store i32 -1, i32 addrspace(4)* %gep + ret void +} + +@global0 = internal addrspace(1) global i32 123, align 4 + +; CHECK-LABEL: @store_select_group_flat_constexpr( +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* @lds1 +; CHECK: store i32 7, i32 addrspace(3)* %select +define amdgpu_kernel void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*) + store i32 7, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_flat_inttoptr_flat( +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) to i32 addrspace(3)*) +; CHECK: store i32 7, i32 addrspace(3)* %select +define amdgpu_kernel void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) + store i32 7, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_flat_inttoptr_group( +; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) +; CHECK-NEXT: store i32 7, i32 addrspace(3)* %select +define amdgpu_kernel void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) to i32 addrspace(4)*) + store i32 7, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_global_mismatch_flat_constexpr( +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* +; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) +; CHECK: store i32 7, i32 addrspace(4)* %select +define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) + store i32 7, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_global_mismatch_flat_constexpr_swap( +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* +; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %1 +; CHECK: store i32 7, i32 addrspace(4)* %select +define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + 
%select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %cast0 + store i32 7, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_global_mismatch_null_null( +; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*) +; CHECK: store i32 7, i32 addrspace(4)* %select +define amdgpu_kernel void @store_select_group_global_mismatch_null_null(i1 %c) #0 { + %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*) + store i32 7, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_global_mismatch_null_null_constexpr( +; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 +define amdgpu_kernel void @store_select_group_global_mismatch_null_null_constexpr() #0 { + store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 + ret void +} + +; CHECK-LABEL: @store_select_group_global_mismatch_gv_null_constexpr( +; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 +define amdgpu_kernel void @store_select_group_global_mismatch_gv_null_constexpr() #0 { + store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 + ret void +} + +; CHECK-LABEL: @store_select_group_global_mismatch_null_gv_constexpr( +; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4 +define amdgpu_kernel void @store_select_group_global_mismatch_null_gv_constexpr() #0 { + store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4 + ret void +} + +; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_null_constexpr( +; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 +define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 { + store i32 7, i32 addrspace(4)* 
select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 + ret void +} + +; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_flat_null_constexpr( +; CHECK: store i32 7, i32 addrspace(1)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(1)* addrspacecast (i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*) to i32 addrspace(1)*), i32 addrspace(1)* null), align 4 +define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 { + store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 + ret void +} + +; CHECK-LABEL: @store_select_group_global_mismatch_undef_undef_constexpr( +; CHECK: store i32 7, i32 addrspace(3)* null +define amdgpu_kernel void @store_select_group_global_mismatch_undef_undef_constexpr() #0 { + store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* undef to i32 addrspace(4)*)), align 4 + ret void +} + +@lds2 = external addrspace(3) global [1024 x i32], align 4 + +; CHECK-LABEL: @store_select_group_constexpr_ptrtoint( +; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* +; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*) +; CHECK: store i32 7, i32 addrspace(4)* %select +define amdgpu_kernel void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { + %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* + %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*) + store i32 7, i32 addrspace(4)* %select + ret void +} + +; CHECK-LABEL: @store_select_group_flat_vector( +; CHECK: %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*> +; CHECK: %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*> +; CHECK: %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1 +; CHECK: %extract0 = extractelement <2 x i32 addrspace(4)*> %select, i32 0 +; CHECK: %extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1 +; CHECK: store i32 -1, i32 addrspace(4)* %extract0 +; CHECK: store i32 -2, i32 addrspace(4)* %extract1 +define amdgpu_kernel void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 { + %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*> + %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*> + %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1 + %extract0 = extractelement <2 x i32 addrspace(4)*> %select, i32 0 + 
%extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1 + store i32 -1, i32 addrspace(4)* %extract0 + store i32 -2, i32 addrspace(4)* %extract1 + ret void +} + +attributes #0 = { nounwind } + +!0 = !{!"branch_weights", i32 2, i32 10} diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll b/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll new file mode 100644 index 000000000000..79bf92610a8d --- /dev/null +++ b/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll @@ -0,0 +1,140 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s + +; Check that volatile users of addrspacecast are not replaced. + +; CHECK-LABEL: @volatile_load_flat_from_global( +; CHECK: load volatile i32, i32 addrspace(4)* +; CHECK: store i32 %val, i32 addrspace(1)* +define amdgpu_kernel void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { + %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* + %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* + %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 + store i32 %val, i32 addrspace(4)* %tmp1, align 4 + ret void +} + +; CHECK-LABEL: @volatile_load_flat_from_constant( +; CHECK: load volatile i32, i32 addrspace(4)* +; CHECK: store i32 %val, i32 addrspace(1)* +define amdgpu_kernel void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { + %tmp0 = addrspacecast i32 addrspace(2)* %input to i32 addrspace(4)* + %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* + %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 + store i32 %val, i32 addrspace(4)* %tmp1, align 4 + ret void +} + +; CHECK-LABEL: @volatile_load_flat_from_group( +; CHECK: load volatile i32, i32 addrspace(4)* +; CHECK: store i32 %val, i32 addrspace(3)* +define amdgpu_kernel void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { + %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* + %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* + %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 + store i32 %val, i32 addrspace(4)* %tmp1, align 4 + ret void +} + +; CHECK-LABEL: @volatile_load_flat_from_private( +; CHECK: load volatile i32, i32 addrspace(4)* +; CHECK: store i32 %val, i32* +define amdgpu_kernel void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 { + %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* + %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* + %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 + store i32 %val, i32 addrspace(4)* %tmp1, align 4 + ret void +} + +; CHECK-LABEL: @volatile_store_flat_to_global( +; CHECK: load i32, i32 addrspace(1)* +; CHECK: store volatile i32 %val, i32 addrspace(4)* +define amdgpu_kernel void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { + %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* + %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %tmp0, align 4 + store volatile i32 %val, i32 addrspace(4)* %tmp1, align 4 + ret void +} + +; CHECK-LABEL: @volatile_store_flat_to_group( +; CHECK: load i32, i32 addrspace(3)* +; CHECK: store volatile i32 %val, i32 addrspace(4)* +define amdgpu_kernel void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture 
%input, i32 addrspace(3)* nocapture %output) #0 { + %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* + %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %tmp0, align 4 + store volatile i32 %val, i32 addrspace(4)* %tmp1, align 4 + ret void +} + +; CHECK-LABEL: @volatile_store_flat_to_private( +; CHECK: load i32, i32* +; CHECK: store volatile i32 %val, i32 addrspace(4)* +define amdgpu_kernel void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 { + %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* + %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %tmp0, align 4 + store volatile i32 %val, i32 addrspace(4)* %tmp1, align 4 + ret void +} + +; CHECK-LABEL: @volatile_atomicrmw_add_group_to_flat( +; CHECK: addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* +; CHECK: atomicrmw volatile add i32 addrspace(4)* +define i32 @volatile_atomicrmw_add_group_to_flat(i32 addrspace(3)* %group.ptr, i32 %y) #0 { + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* + %ret = atomicrmw volatile add i32 addrspace(4)* %cast, i32 %y seq_cst + ret i32 %ret +} + +; CHECK-LABEL: @volatile_atomicrmw_add_global_to_flat( +; CHECK: addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* +; CHECK: %ret = atomicrmw volatile add i32 addrspace(4)* +define i32 @volatile_atomicrmw_add_global_to_flat(i32 addrspace(1)* %global.ptr, i32 %y) #0 { + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* + %ret = atomicrmw volatile add i32 addrspace(4)* %cast, i32 %y seq_cst + ret i32 %ret +} + +; CHECK-LABEL: @volatile_cmpxchg_global_to_flat( +; CHECK: addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* +; CHECK: cmpxchg volatile i32 addrspace(4)* +define { i32, i1 } @volatile_cmpxchg_global_to_flat(i32 addrspace(1)* %global.ptr, i32 %cmp, i32 %val) #0 { + %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)* + %ret = cmpxchg volatile i32 addrspace(4)* %cast, i32 %cmp, i32 %val seq_cst monotonic + ret { i32, i1 } %ret +} + +; CHECK-LABEL: @volatile_cmpxchg_group_to_flat( +; CHECK: addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* +; CHECK: cmpxchg volatile i32 addrspace(4)* +define { i32, i1 } @volatile_cmpxchg_group_to_flat(i32 addrspace(3)* %group.ptr, i32 %cmp, i32 %val) #0 { + %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)* + %ret = cmpxchg volatile i32 addrspace(4)* %cast, i32 %cmp, i32 %val seq_cst monotonic + ret { i32, i1 } %ret +} + +; FIXME: Shouldn't be losing names +; CHECK-LABEL: @volatile_memset_group_to_flat( +; CHECK: addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* +; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true) +define amdgpu_kernel void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { + %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* + call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true) + ret void +} + +; CHECK-LABEL: @volatile_memset_global_to_flat( +; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* +; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true) +define amdgpu_kernel void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { + %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* + call void 
@llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true) + ret void +} + +declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } diff --git a/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll b/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll new file mode 100644 index 000000000000..b2d8ddb19565 --- /dev/null +++ b/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll @@ -0,0 +1,24 @@ +; RUN: opt -S -mtriple=nvptx64-nvidia-cuda -infer-address-spaces %s | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" + +%struct.bar = type { float, float* } + +@var1 = local_unnamed_addr addrspace(3) externally_initialized global %struct.bar undef, align 8 + +; CHECK-LABEL: @bug31948( +; CHECK: %tmp = load float*, float* addrspace(3)* getelementptr inbounds (%struct.bar, %struct.bar addrspace(3)* @var1, i64 0, i32 1), align 8 +; CHECK: %tmp1 = load float, float* %tmp, align 4 +; CHECK: store float %conv1, float* %tmp, align 4 +; CHECK: store i32 32, i32 addrspace(3)* addrspacecast (i32* bitcast (float** getelementptr (%struct.bar, %struct.bar* addrspacecast (%struct.bar addrspace(3)* @var1 to %struct.bar*), i64 0, i32 1) to i32*) to i32 addrspace(3)*), align 4 +define void @bug31948(float %a, float* nocapture readnone %x, float* nocapture readnone %y) local_unnamed_addr #0 { +entry: + %tmp = load float*, float** getelementptr (%struct.bar, %struct.bar* addrspacecast (%struct.bar addrspace(3)* @var1 to %struct.bar*), i64 0, i32 1), align 8 + %tmp1 = load float, float* %tmp, align 4 + %conv1 = fadd float %tmp1, 1.000000e+00 + store float %conv1, float* %tmp, align 4 + store i32 32, i32* bitcast (float** getelementptr (%struct.bar, %struct.bar* addrspacecast (%struct.bar addrspace(3)* @var1 to %struct.bar*), i64 0, i32 1) to i32*), align 4 + ret void +} + +attributes #0 = { norecurse nounwind } diff --git a/test/Transforms/InferAddressSpaces/NVPTX/lit.local.cfg b/test/Transforms/InferAddressSpaces/NVPTX/lit.local.cfg new file mode 100644 index 000000000000..2cb98eb371b2 --- /dev/null +++ b/test/Transforms/InferAddressSpaces/NVPTX/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'NVPTX' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/Inline/AArch64/gep-cost.ll b/test/Transforms/Inline/AArch64/gep-cost.ll new file mode 100644 index 000000000000..204958f082dd --- /dev/null +++ b/test/Transforms/Inline/AArch64/gep-cost.ll @@ -0,0 +1,30 @@ +; REQUIRES: asserts +; RUN: opt -inline -mtriple=aarch64--linux-gnu -mcpu=kryo -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define void @outer([4 x i32]* %ptr, i32 %i) { + call void @inner1([4 x i32]* %ptr, i32 %i) + call void @inner2([4 x i32]* %ptr, i32 %i) + ret void +} +; The gep in inner1() is reg+reg, which is a legal addressing mode for AArch64. +; Thus, both the gep and ret can be simplified. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 2 +define void @inner1([4 x i32]* %ptr, i32 %i) { + %G = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i32 0, i32 %i + ret void +} + +; The gep in inner2() is reg+imm+reg, which is not a legal addressing mode for +; AArch64. Thus, only the ret can be simplified and not the gep. 
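+; (Illustrative arithmetic, not checked by FileCheck: for [4 x i32], the gep
+; below computes %ptr + 1*16 + %i*4, i.e. base + immediate + scaled register,
+; which needs an extra add instruction on AArch64 and is therefore not free.)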
+; CHECK: Analyzing call of inner2
+; CHECK: NumInstructionsSimplified: 1
+; CHECK: NumInstructions: 2
+define void @inner2([4 x i32]* %ptr, i32 %i) {
+  %G = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i32 1, i32 %i
+  ret void
+}
diff --git a/test/Transforms/Inline/AArch64/lit.local.cfg b/test/Transforms/Inline/AArch64/lit.local.cfg
new file mode 100644
index 000000000000..7184443994b6
--- /dev/null
+++ b/test/Transforms/Inline/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Transforms/Inline/alloca-bonus.ll b/test/Transforms/Inline/alloca-bonus.ll
index 542dcee0fcb2..c5c2ce11cc5b 100644
--- a/test/Transforms/Inline/alloca-bonus.ll
+++ b/test/Transforms/Inline/alloca-bonus.ll
@@ -3,7 +3,7 @@
 target datalayout = "p:32:32"
 
-declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
+declare void @llvm.lifetime.start.p0i8(i64 %size, i8* nocapture %ptr)
 
 @glbl = external global i32
 
@@ -22,7 +22,7 @@ define void @inner1(i32 *%ptr) {
   %D = getelementptr inbounds i32, i32* %ptr, i32 1
   %E = bitcast i32* %ptr to i8*
   %F = select i1 false, i32* %ptr, i32* @glbl
-  call void @llvm.lifetime.start(i64 0, i8* %E)
+  call void @llvm.lifetime.start.p0i8(i64 0, i8* %E)
   call void @extern()
   ret void
 }
@@ -43,7 +43,7 @@ define void @inner2(i32 *%ptr) {
   %D = getelementptr inbounds i32, i32* %ptr, i32 %A
   %E = bitcast i32* %ptr to i8*
   %F = select i1 false, i32* %ptr, i32* @glbl
-  call void @llvm.lifetime.start(i64 0, i8* %E)
+  call void @llvm.lifetime.start.p0i8(i64 0, i8* %E)
   call void @extern()
   ret void
 }
@@ -152,7 +152,7 @@ if.then:
   %D = getelementptr inbounds i32, i32* %ptr, i32 %A
   %E = bitcast i32* %ptr to i8*
   %F = select i1 false, i32* %ptr, i32* @glbl
-  call void @llvm.lifetime.start(i64 0, i8* %E)
+  call void @llvm.lifetime.start.p0i8(i64 0, i8* %E)
   ret void
 
 exit:
diff --git a/test/Transforms/Inline/arg-attr-propagation.ll b/test/Transforms/Inline/arg-attr-propagation.ll
new file mode 100644
index 000000000000..3d18e8047e5b
--- /dev/null
+++ b/test/Transforms/Inline/arg-attr-propagation.ll
@@ -0,0 +1,50 @@
+; RUN: opt -inline -S < %s | FileCheck %s
+
+; The callee guarantees that the pointer argument is nonnull and dereferenceable.
+; That information should transfer to the caller.
+
+define i32 @callee(i32* dereferenceable(32) %t1) {
+; CHECK-LABEL: @callee(i32* dereferenceable(32) %t1)
+; CHECK-NEXT: [[T2:%.*]] = load i32, i32* %t1
+; CHECK-NEXT: ret i32 [[T2]]
+;
+  %t2 = load i32, i32* %t1
+  ret i32 %t2
+}
+
+; FIXME: All dereferenceability information is lost.
+; The caller argument could be known nonnull and dereferenceable(32).
+
+define i32 @caller1(i32* %t1) {
+; CHECK-LABEL: @caller1(i32* %t1)
+; CHECK-NEXT: [[T2_I:%.*]] = load i32, i32* %t1
+; CHECK-NEXT: ret i32 [[T2_I]]
+;
+  %t2 = tail call i32 @callee(i32* dereferenceable(32) %t1)
+  ret i32 %t2
+}
+
+; The caller argument is nonnull, but that could be made explicit.
+; The dereferenceable amount could be increased.
+
+define i32 @caller2(i32* dereferenceable(31) %t1) {
+; CHECK-LABEL: @caller2(i32* dereferenceable(31) %t1)
+; CHECK-NEXT: [[T2_I:%.*]] = load i32, i32* %t1
+; CHECK-NEXT: ret i32 [[T2_I]]
+;
+  %t2 = tail call i32 @callee(i32* dereferenceable(32) %t1)
+  ret i32 %t2
+}
+
+; The caller argument is nonnull, but that could be made explicit.
+; Make sure that we don't propagate a smaller dereferenceable amount.
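+; For example, @caller3 below already knows dereferenceable(33); merging in the
+; callee's dereferenceable(32) must not weaken that to 32.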
+
+define i32 @caller3(i32* dereferenceable(33) %t1) {
+; CHECK-LABEL: @caller3(i32* dereferenceable(33) %t1)
+; CHECK-NEXT: [[T2_I:%.*]] = load i32, i32* %t1
+; CHECK-NEXT: ret i32 [[T2_I]]
+;
+  %t2 = tail call i32 @callee(i32* dereferenceable(32) %t1)
+  ret i32 %t2
+}
+
diff --git a/test/Transforms/Inline/bfi-update.ll b/test/Transforms/Inline/bfi-update.ll
new file mode 100644
index 000000000000..94584e2e6ce5
--- /dev/null
+++ b/test/Transforms/Inline/bfi-update.ll
@@ -0,0 +1,93 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S -inline-threshold=50 -inline-cold-callsite-threshold=0 -hot-callsite-threshold=50 | FileCheck %s
+; This tests incremental updates to the caller's BFI as a callee gets inlined.
+; In bottom-up inlining, first c->e inlining is considered and fails because
+; e's size exceeds the threshold of 50. Then a->c inlining is considered and it
+; succeeds. a's BFI is updated incrementally. As c's blocks get pruned, the
+; block with label cond_false is removed and, since the remaining code is
+; straight-line, a single block gets cloned into a. This block should get the
+; maximum block frequency among the original blocks in c. If it gets the
+; frequency of the block with label cond_true in @c, its frequency will be
+; 1/10th of function a's entry block frequency, resulting in a callsite count of
+; 2 (since a's entry count is 20), which means that the a->e callsite will be
+; considered cold and not inlined.
+
+@data = external global i32
+; CHECK-LABEL: define i32 @a(
+define i32 @a(i32 %a1) !prof !21 {
+; CHECK-NOT: call i32 @c
+; CHECK-NOT: call i32 @e
+; CHECK: ret
+entry:
+  %cond = icmp sle i32 %a1, 1
+  %a2 = call i32 @c(i32 1)
+  br label %exit
+exit:
+  ret i32 %a2
+}
+
+declare void @ext();
+
+; CHECK: @c(i32 %c1) !prof [[COUNT1:![0-9]+]]
+define i32 @c(i32 %c1) !prof !23 {
+  call void @ext()
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false, !prof !25
+
+cond_false:
+  br label %exit
+
+cond_true:
+  %c11 = call i32 @e(i32 %c1)
+  br label %exit
+exit:
+  %c12 = phi i32 [ 0, %cond_false], [ %c11, %cond_true ]
+  ret i32 %c12
+}
+
+
+; CHECK: @e(i32 %c1) !prof [[COUNT2:![0-9]+]]
+define i32 @e(i32 %c1) !prof !24 {
+  call void @ext()
+  call void @ext()
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  call void @ext()
+  %c2 = load i32, i32* @data, align 4
+  %c3 = add i32 %c1, %c2
+  %c4 = mul i32 %c3, %c2
+  %c5 = add i32 %c4, %c2
+  %c6 = mul i32 %c5, %c2
+  %c7 = add i32 %c6, %c2
+  %c8 = mul i32 %c7, %c2
+  %c9 = add i32 %c8, %c2
+  %c10 = mul i32 %c9, %c2
+  ret i32 %c10
+
+cond_true:
+  ret i32 0
+}
+
+; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 480}
+; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 80}
+!21 = !{!"function_entry_count", i64 20}
+!23 = !{!"function_entry_count", i64 500}
+!24 = !{!"function_entry_count", i64 100}
+!25 = !{!"branch_weights", i32 1, i32 9}
+
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 1000, i32 1}
+!13 = !{i32 999000, i64 1000, i32 1}
+!14 = !{i32 999999, i64 5, i32 2}
diff --git a/test/Transforms/Inline/cgscc-incremental-invalidate.ll b/test/Transforms/Inline/cgscc-incremental-invalidate.ll
new file mode 100644
index 000000000000..82d321ccf225
--- /dev/null
+++ b/test/Transforms/Inline/cgscc-incremental-invalidate.ll
@@ -0,0 +1,111 @@
+; Test for a subtle bug when computing analyses during inlining and mutating
+; the SCC structure. Without care, this can fail to invalidate analyses.
+;
+; RUN: opt < %s -passes='cgscc(inline,function(verify<domtree>))' -debug-pass-manager -S 2>&1 | FileCheck %s
+
+; First we check that the passes run in the way we expect. Otherwise this test
+; may stop testing anything.
+;
+; CHECK-LABEL: Starting llvm::Module pass manager run.
+; CHECK: Running pass: InlinerPass on (test1_f, test1_g, test1_h)
+; CHECK: Running analysis: FunctionAnalysisManagerCGSCCProxy on (test1_f, test1_g, test1_h)
+; CHECK: Running analysis: DominatorTreeAnalysis on test1_f
+; CHECK: Running analysis: DominatorTreeAnalysis on test1_g
+; CHECK: Invalidating all non-preserved analyses for: (test1_f, test1_g, test1_h)
+; CHECK: Invalidating all non-preserved analyses for: test1_f
+; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_f
+; CHECK: Invalidating all non-preserved analyses for: test1_g
+; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_g
+; CHECK: Invalidating all non-preserved analyses for: test1_h
+; CHECK-NOT: Invalidating analysis:
+; CHECK: Running analysis: DominatorTreeAnalysis on test1_h
+; CHECK: Invalidating all non-preserved analyses for: (test1_g, test1_h)
+; CHECK: Invalidating all non-preserved analyses for: test1_h
+; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_h
+
+; An external function used to control branches.
+declare i1 @flag()
+; CHECK-LABEL: declare i1 @flag()
+
+; The utility function with interesting control flow that gets inlined below to
+; perturb the dominator tree.
+define internal void @callee() {
+entry:
+  %ptr = alloca i8
+  %flag = call i1 @flag()
+  br i1 %flag, label %then, label %else
+
+then:
+  store volatile i8 42, i8* %ptr
+  br label %return
+
+else:
+  store volatile i8 -42, i8* %ptr
+  br label %return
+
+return:
+  ret void
+}
+
+; The 'test1_' prefixed functions carefully test that incrementally reducing an
+; SCC in the inliner cannot accidentally leave stale function analysis results
+; due to failing to invalidate them for all the functions.

+; The inliner visits this last function. It can't actually break any cycles
+; here, but because we visit this function we compute fresh analyses for it.
+; These analyses are then invalidated when we inline callee, disrupting the
+; CFG, and it is important that they be freed.
+define void @test1_h() {
+; CHECK-LABEL: define void @test1_h()
+entry:
+  call void @test1_g()
+; CHECK: call void @test1_g()
+
+  ; Pull interesting CFG into this function.
+  call void @callee()
+; CHECK-NOT: call void @callee()
+
+  ret void
+; CHECK: ret void
+}
+
+; We visit this function second and here we inline the edge to 'test1_f',
+; separating it into its own SCC. The current SCC is now just 'test1_g' and
+; 'test1_h'.
+define void @test1_g() {
+; CHECK-LABEL: define void @test1_g()
+entry:
+  ; This edge gets inlined away.
+  call void @test1_f()
+; CHECK-NOT: call void @test1_f()
+; CHECK: call void @test1_g()
+
+  ; We force this edge to survive inlining.
+  call void @test1_h() noinline
+; CHECK: call void @test1_h()
+
+  ; Pull interesting CFG into this function.
+  call void @callee()
+; CHECK-NOT: call void @callee()
+
+  ret void
+; CHECK: ret void
+}
+
+; We visit this function first in the inliner, and while we inline callee,
+; perturbing the CFG, we don't inline anything else and the SCC structure
+; remains intact.
+define void @test1_f() {
+; CHECK-LABEL: define void @test1_f()
+entry:
+  ; We force this edge to survive inlining.
+  call void @test1_g() noinline
+; CHECK: call void @test1_g()
+
+  ; Pull interesting CFG into this function.
+  call void @callee()
+; CHECK-NOT: call void @callee()
+
+  ret void
+; CHECK: ret void
+}
diff --git a/test/Transforms/Inline/cgscc-invalidate.ll b/test/Transforms/Inline/cgscc-invalidate.ll
index 60315cda771d..69d84f65e251 100644
--- a/test/Transforms/Inline/cgscc-invalidate.ll
+++ b/test/Transforms/Inline/cgscc-invalidate.ll
@@ -65,15 +65,15 @@ entry:
 ; The 'test3_' prefixed functions test the scenario of not inlining preserving
 ; dominators after splitting an SCC into two smaller SCCs.
 
-; The first function gets visited first and we end up inlining everything we
-; can into this routine. That splits test3_g into a separate SCC that is enqued
-; for later processing.
-define void @test3_f() {
-; CHECK-LABEL: define void @test3_f()
+; This function ends up split into a separate SCC, which can cause its analyses
+; to become stale if the splitting doesn't properly invalidate things. Also, as
+; a consequence of being split out, test3_f is too large to inline by the time
+; we get here.
+define void @test3_g() {
+; CHECK-LABEL: define void @test3_g()
 entry:
-  ; Create the first edge in the SCC cycle.
-  call void @test3_g()
-; CHECK-NOT: @test3_g()
+  ; Create the second edge in the SCC cycle.
+  call void @test3_f()
 ; CHECK: call void @test3_f()
 
   ; Pull interesting CFG into this function.
@@ -84,15 +84,15 @@ entry:
 ; CHECK: ret void
 }
 
-; This function ends up split into a separate SCC, which can cause its analyses
-; to become stale if the splitting doesn't properly invalidate things. Also, as
-; a consequence of being split out, test3_f is too large to inline by the time
-; we get here.
-define void @test3_g() {
-; CHECK-LABEL: define void @test3_g()
+; The second function gets visited first and we end up inlining everything we
+; can into this routine. That splits test3_g into a separate SCC that is
+; enqueued for later processing.
+define void @test3_f() {
+; CHECK-LABEL: define void @test3_f()
 entry:
-  ; Create the second edge in the SCC cycle.
-  call void @test3_f()
+  ; Create the first edge in the SCC cycle.
+  call void @test3_g()
+; CHECK-NOT: @test3_g()
 ; CHECK: call void @test3_f()
 
   ; Pull interesting CFG into this function.
diff --git a/test/Transforms/Inline/clear-analyses.ll b/test/Transforms/Inline/clear-analyses.ll
new file mode 100644
index 000000000000..4b1d37ca29a9
--- /dev/null
+++ b/test/Transforms/Inline/clear-analyses.ll
@@ -0,0 +1,32 @@
+; Test that when a pass like correlated-propagation populates an analysis such
+; as LVI with references back into the IR of a function that the inliner will
+; delete, this doesn't crash or go awry despite the inliner clearing the analyses
+; separately from when it deletes the function.
+;
+; RUN: opt -debug-pass-manager -S < %s 2>&1 \
+; RUN:     -passes='cgscc(inline,function(correlated-propagation))' \
+; RUN:     | FileCheck %s
+;
+; CHECK-LABEL: Starting llvm::Module pass manager run.
+; CHECK: Running pass: InlinerPass on (callee)
+; CHECK: Running pass: CorrelatedValuePropagationPass on callee
+; CHECK: Running analysis: LazyValueAnalysis
+; CHECK: Running pass: InlinerPass on (caller)
+; CHECK: Clearing all analysis results for: callee
+; CHECK: Running pass: CorrelatedValuePropagationPass on caller
+; CHECK: Running analysis: LazyValueAnalysis
+
+define internal i32 @callee(i32 %x) {
+; CHECK-NOT: @callee
+entry:
+  ret i32 %x
+}
+
+define i32 @caller(i32 %x) {
+; CHECK-LABEL: define i32 @caller
+entry:
+  %call = call i32 @callee(i32 %x)
+; CHECK-NOT: call
+  ret i32 %call
+; CHECK: ret i32 %x
+}
diff --git a/test/Transforms/Inline/crash-lifetime-marker.ll b/test/Transforms/Inline/crash-lifetime-marker.ll
index e7a594cdb5e4..7196616521e9 100644
--- a/test/Transforms/Inline/crash-lifetime-marker.ll
+++ b/test/Transforms/Inline/crash-lifetime-marker.ll
@@ -15,9 +15,9 @@ define i32 @callee1(i32 %count) {
 ; CHECK-LABEL: define i32 @caller1(
 ; CHECK: [[ALLOCA:%[a-z0-9\.]+]] = alloca i8
-; CHECK-NOT: call void @llvm.lifetime.start(
+; CHECK-NOT: call void @llvm.lifetime.start.p0i8(
 ; CHECK: call i32 @callee2(i8* [[ALLOCA]])
-; CHECK-NOT: call void @llvm.lifetime.end(
+; CHECK-NOT: call void @llvm.lifetime.end.p0i8(
 define i32 @caller1(i32 %count) {
   %call0 = call i32 @callee1(i32 0)
diff --git a/test/Transforms/Inline/function-count-update-2.ll b/test/Transforms/Inline/function-count-update-2.ll
new file mode 100644
index 000000000000..702fa6292c29
--- /dev/null
+++ b/test/Transforms/Inline/function-count-update-2.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S | FileCheck %s
+
+; This tests that the function count of a callee gets correctly updated after it
+; has been inlined into two callsites.
+
+; CHECK: @callee() !prof [[COUNT:![0-9]+]]
+define i32 @callee() !prof !1 {
+  ret i32 0
+}
+
+define i32 @caller1() !prof !2 {
+; CHECK-LABEL: @caller1
+; CHECK-NOT: callee
+; CHECK: ret
+  %i = call i32 @callee()
+  ret i32 %i
+}
+
+define i32 @caller2() !prof !3 {
+; CHECK-LABEL: @caller2
+; CHECK-NOT: callee
+; CHECK: ret
+  %i = call i32 @callee()
+  ret i32 %i
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT]] = !{!"function_entry_count", i64 0}
+!0 = !{i32 1, !"MaxFunctionCount", i32 1000}
+!1 = !{!"function_entry_count", i64 1000}
+!2 = !{!"function_entry_count", i64 600}
+!3 = !{!"function_entry_count", i64 400}
+
diff --git a/test/Transforms/Inline/function-count-update-3.ll b/test/Transforms/Inline/function-count-update-3.ll
new file mode 100644
index 000000000000..215d64175faf
--- /dev/null
+++ b/test/Transforms/Inline/function-count-update-3.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S -inline-threshold=50 | FileCheck %s
+
+; This tests that the function count of a function gets properly scaled after
+; inlining a call chain leading to the function.
+; Function a calls c with count 200 (C1)
+; Function c calls e with count 250 (C2)
+; Entry count of e is 500 (C3)
+; Entry count of c is 500 (C4)
+; Function b calls c with count 300 (C5)
+; c->e inlining does not happen since the cost exceeds the threshold.
+; c is then inlined into a.
+; e now gets inlined into a (through c) since the branch condition in e is now
+; known and hence the cost gets reduced.
+; Estimated count of a->e callsite = C2 * (C1 / C4)
+; Estimated count of a->e callsite = 250 * (200 / 500) = 100
+; Remaining count of e = C3 - 100 = 500 - 100 = 400
+; Remaining count of c = C4 - C1 - C5 = 500 - 200 - 300 = 0
+
+@data = external global i32
+
+define i32 @a(i32 %a1) !prof !1 {
+  %a2 = call i32 @c(i32 %a1, i32 1)
+  ret i32 %a2
+}
+
+define i32 @b(i32 %b1) !prof !2 {
+  %b2 = call i32 @c(i32 %b1, i32 %b1)
+  ret i32 %b2
+}
+
+declare void @ext();
+
+; CHECK: @c(i32 %c1, i32 %c100) !prof [[COUNT1:![0-9]+]]
+define i32 @c(i32 %c1, i32 %c100) !prof !3 {
+  call void @ext()
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  ret i32 0
+
+cond_true:
+  %c11 = call i32 @e(i32 %c100)
+  ret i32 %c11
+}
+
+
+; CHECK: @e(i32 %c1) !prof [[COUNT2:![0-9]+]]
+define i32 @e(i32 %c1) !prof !4 {
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  call void @ext()
+  %c2 = load i32, i32* @data, align 4
+  %c3 = add i32 %c1, %c2
+  %c4 = mul i32 %c3, %c2
+  %c5 = add i32 %c4, %c2
+  %c6 = mul i32 %c5, %c2
+  %c7 = add i32 %c6, %c2
+  %c8 = mul i32 %c7, %c2
+  %c9 = add i32 %c8, %c2
+  %c10 = mul i32 %c9, %c2
+  ret i32 %c10
+
+cond_true:
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 0}
+; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 400}
+!0 = !{i32 1, !"MaxFunctionCount", i32 5000}
+!1 = !{!"function_entry_count", i64 200}
+!2 = !{!"function_entry_count", i64 300}
+!3 = !{!"function_entry_count", i64 500}
+!4 = !{!"function_entry_count", i64 500}
+
diff --git a/test/Transforms/Inline/function-count-update.ll b/test/Transforms/Inline/function-count-update.ll
new file mode 100644
index 000000000000..094ad5a2ae67
--- /dev/null
+++ b/test/Transforms/Inline/function-count-update.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S | FileCheck %s
+
+; This tests that the function counts of two callees get correctly updated after
+; they have been inlined into two back-to-back callsites in a single basic block
+; in the caller. The callees have the alwaysinline attribute and so they get
+; inlined both with the regular inliner pass and the always inline pass. In
+; both cases, the new count of each callee is the original count minus the
+; callsite count, which is 200 (since the caller's entry count is 400 and the
+; block containing the calls has a relative block frequency of 0.5).
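+; Worked out with the numbers in this file: the callsite count is 400 * 0.5 =
+; 200, so callee1's count drops from 1000 to 800 and callee2's from 2000 to
+; 1800, matching the function_entry_count values checked below.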
+ +; CHECK: @callee1(i32 %n) #0 !prof [[COUNT1:![0-9]+]] +define i32 @callee1(i32 %n) #0 !prof !1 { + %cond = icmp sle i32 %n, 10 + br i1 %cond, label %cond_true, label %cond_false + +cond_true: + %r1 = add i32 %n, 1 + ret i32 %r1 +cond_false: + %r2 = add i32 %n, 2 + ret i32 %r2 +} + +; CHECK: @callee2(i32 %n) #0 !prof [[COUNT2:![0-9]+]] +define i32 @callee2(i32 %n) #0 !prof !2 { + %r1 = add i32 %n, 1 + ret i32 %r1 +} + +define i32 @caller(i32 %n) !prof !3 { + %cond = icmp sle i32 %n, 100 + br i1 %cond, label %cond_true, label %cond_false + +cond_true: + %i = call i32 @callee1(i32 %n) + %j = call i32 @callee2(i32 %i) + ret i32 %j +cond_false: + ret i32 0 +} + +!llvm.module.flags = !{!0} +; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 800} +; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 1800} +!0 = !{i32 1, !"MaxFunctionCount", i32 1000} +!1 = !{!"function_entry_count", i64 1000} +!2 = !{!"function_entry_count", i64 2000} +!3 = !{!"function_entry_count", i64 400} +attributes #0 = { alwaysinline } + diff --git a/test/Transforms/Inline/inline-cold-callee.ll b/test/Transforms/Inline/inline-cold-callee.ll index 153f446c5c2e..404c537b297f 100644 --- a/test/Transforms/Inline/inline-cold-callee.ll +++ b/test/Transforms/Inline/inline-cold-callee.ll @@ -1,5 +1,4 @@ ; RUN: opt < %s -inline -inlinecold-threshold=0 -S | FileCheck %s -; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inlinecold-threshold=0 -S | FileCheck %s ; This tests that a cold callee gets the (lower) inlinecold-threshold even without ; Cold hint and does not get inlined because the cost exceeds the inlinecold-threshold. diff --git a/test/Transforms/Inline/inline-cold-callsite.ll b/test/Transforms/Inline/inline-cold-callsite.ll new file mode 100644 index 000000000000..26ea8e50eaf1 --- /dev/null +++ b/test/Transforms/Inline/inline-cold-callsite.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=100 -inline-cold-callsite-threshold=0 -S | FileCheck %s + +; This tests that a cold callsite gets the inline-cold-callsite-threshold +; and does not get inlined. Another callsite to an identical callee that +; is not cold gets inlined because cost is below the inline-threshold. 
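+; Rough arithmetic, for illustration: with an entry count of 200 and branch
+; weights of 200:1, the cond_false callsite count is about 1, which the profile
+; summary below classifies as cold, so the %j callsite gets the zero
+; inline-cold-callsite-threshold.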
+
+define i32 @callee1(i32 %x) !prof !21 {
+  %x1 = add i32 %x, 1
+  %x2 = add i32 %x1, 1
+  %x3 = add i32 %x2, 1
+  call void @extern()
+  ret i32 %x3
+}
+
+define i32 @caller(i32 %n) !prof !22 {
+; CHECK-LABEL: @caller(
+  %cond = icmp sle i32 %n, 100
+  br i1 %cond, label %cond_true, label %cond_false, !prof !0
+
+cond_true:
+; CHECK-LABEL: cond_true:
+; CHECK-NOT: call i32 @callee1
+; CHECK: ret i32 %x3.i
+  %i = call i32 @callee1(i32 %n)
+  ret i32 %i
+cond_false:
+; CHECK-LABEL: cond_false:
+; CHECK: call i32 @callee1
+; CHECK: ret i32 %j
+  %j = call i32 @callee1(i32 %n)
+  ret i32 %j
+}
+declare void @extern()
+
+!0 = !{!"branch_weights", i32 200, i32 1}
+
+!llvm.module.flags = !{!1}
+!21 = !{!"function_entry_count", i64 200}
+!22 = !{!"function_entry_count", i64 200}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 1000, i32 1}
+!13 = !{i32 999000, i64 1000, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
diff --git a/test/Transforms/Inline/inline-hot-callsite-2.ll b/test/Transforms/Inline/inline-hot-callsite-2.ll
new file mode 100644
index 000000000000..ccfe2f0b5dec
--- /dev/null
+++ b/test/Transforms/Inline/inline-hot-callsite-2.ll
@@ -0,0 +1,56 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=0 -inlinehint-threshold=0 -hot-callsite-threshold=100 -S | FileCheck %s
+
+; This tests that a callsite which is determined to be hot based on the caller's
+; entry count and the callsite block frequency gets the hot-callsite-threshold.
+; Another callsite with the same callee that is not hot does not get inlined
+; because its cost exceeds the inline-threshold. inlinehint-threshold is set to
+; 0 to ensure the callee's hotness is not used to boost the threshold.
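+; Rough arithmetic, for illustration: with an entry count of 200 and branch
+; weights of 64:4, the cond_true callsite count is about 200 * 64/68 = 188,
+; which the profile summary below classifies as hot, so the %i callsite gets
+; the hot-callsite-threshold of 100.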
+ +define i32 @callee1(i32 %x) !prof !21 { + %x1 = add i32 %x, 1 + %x2 = add i32 %x1, 1 + %x3 = add i32 %x2, 1 + call void @extern() + ret i32 %x3 +} + +define i32 @caller(i32 %n) !prof !22 { +; CHECK-LABEL: @caller( + %cond = icmp sle i32 %n, 100 + br i1 %cond, label %cond_true, label %cond_false, !prof !0 + +cond_true: +; CHECK-LABEL: cond_true: +; CHECK-NOT: call i32 @callee1 +; CHECK: ret i32 %x3.i + %i = call i32 @callee1(i32 %n) + ret i32 %i +cond_false: +; CHECK-LABEL: cond_false: +; CHECK: call i32 @callee1 +; CHECK: ret i32 %j + %j = call i32 @callee1(i32 %n) + ret i32 %j +} +declare void @extern() + +!0 = !{!"branch_weights", i32 64, i32 4} + +!llvm.module.flags = !{!1} +!21 = !{!"function_entry_count", i64 200} +!22 = !{!"function_entry_count", i64 200} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2} diff --git a/test/Transforms/Inline/inline-hot-callsite.ll b/test/Transforms/Inline/inline-hot-callsite.ll index bdd7175b3eea..ebf4030d3d10 100644 --- a/test/Transforms/Inline/inline-hot-callsite.ll +++ b/test/Transforms/Inline/inline-hot-callsite.ll @@ -41,7 +41,7 @@ declare void @extern() !1 = !{i32 1, !"ProfileSummary", !2} !2 = !{!3, !4, !5, !6, !7, !8, !9, !10} -!3 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"ProfileFormat", !"SampleProfile"} !4 = !{!"TotalCount", i64 10000} !5 = !{!"MaxCount", i64 1000} !6 = !{!"MaxInternalCount", i64 1} diff --git a/test/Transforms/Inline/inline_stats.ll b/test/Transforms/Inline/inline_stats.ll index cf0d43e9215b..bc005b6afd51 100644 --- a/test/Transforms/Inline/inline_stats.ll +++ b/test/Transforms/Inline/inline_stats.ll @@ -36,9 +36,12 @@ define void @internal3() { ret void } +declare void @external_decl() + define void @external1() alwaysinline !thinlto_src_module !0 { call fastcc void @internal2() call fastcc void @external2(); + call void @external_decl(); ret void } diff --git a/test/Transforms/Inline/internal-scc-members.ll b/test/Transforms/Inline/internal-scc-members.ll new file mode 100644 index 000000000000..258ce00744c5 --- /dev/null +++ b/test/Transforms/Inline/internal-scc-members.ll @@ -0,0 +1,31 @@ +; Test that the inliner can handle deleting functions within an SCC while still +; processing the calls in that SCC. 
+;
+; RUN: opt < %s -S -inline | FileCheck %s
+; RUN: opt < %s -S -passes=inline | FileCheck %s
+
+; CHECK-LABEL: define internal void @test1_scc0()
+; CHECK-NOT: call
+; CHECK: call void @test1_scc0()
+; CHECK-NOT: call
+; CHECK: ret
+define internal void @test1_scc0() {
+entry:
+  call void @test1_scc1()
+  ret void
+}
+
+; CHECK-NOT: @test1_scc1
+define internal void @test1_scc1() {
+entry:
+  call void @test1_scc0()
+  ret void
+}
+
+; CHECK-LABEL: define void @test1()
+; CHECK: call void @test1_scc0()
+define void @test1() {
+entry:
+  call void @test1_scc0() noinline
+  ret void
+}
diff --git a/test/Transforms/Inline/last-call-bonus.ll b/test/Transforms/Inline/last-call-bonus.ll
new file mode 100644
index 000000000000..0088d316848f
--- /dev/null
+++ b/test/Transforms/Inline/last-call-bonus.ll
@@ -0,0 +1,52 @@
+; The goal of this test is to check whether LastCallToStaticBonus is applied
+; correctly while deciding inline deferral. For the test code below, when the
+; inliner evaluates the callsite of bar->baz, it checks if inlining of bar->baz
+; prevents inlining of foo->bar, even when foo->bar inlining is more beneficial
+; than bar->baz inlining. As LastCallToStaticBonus has a massive value, and
+; both baz and bar have only one caller, the cost of foo->bar inlining and
+; bar->baz inlining should be non-trivial for the inliner to compute that
+; bar->baz inlining can actually prevent foo->bar inlining. To make the cost of
+; these callsites big enough, the loop unrolling pass with a very high
+; threshold is used to preprocess the test.
+
+; RUN: opt < %s -loop-unroll -inline -unroll-threshold=15000 -inline-threshold=250 -S | FileCheck %s
+; CHECK-LABEL: define internal i32 @bar()
+
+define internal i32 @baz() {
+entry:
+  br label %bb1
+
+bb1:
+  %ind = phi i32 [ 0, %entry ], [ %inc, %bb1 ]
+  call void @extern()
+  %inc = add nsw i32 %ind, 1
+  %cmp = icmp sgt i32 %inc, 510
+  br i1 %cmp, label %ret, label %bb1
+
+ret:
+  ret i32 0
+}
+
+define internal i32 @bar() {
+entry:
+  br label %bb1
+
+bb1:
+  %ind = phi i32 [ 0, %entry ], [ %inc, %bb1 ]
+  call void @extern()
+  %inc = add nsw i32 %ind, 1
+  %cmp = icmp sgt i32 %inc, 510
+  br i1 %cmp, label %ret, label %bb1
+
+ret:
+  call i32 @baz()
+  ret i32 0
+}
+
+define i32 @foo() {
+entry:
+  call i32 @bar()
+  ret i32 0
+}
+
+declare void @extern()
diff --git a/test/Transforms/Inline/lifetime-no-datalayout.ll b/test/Transforms/Inline/lifetime-no-datalayout.ll
index 0212e69d624a..5d1872c6a244 100644
--- a/test/Transforms/Inline/lifetime-no-datalayout.ll
+++ b/test/Transforms/Inline/lifetime-no-datalayout.ll
@@ -13,9 +13,9 @@ define void @helper() {
 define void @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.start(i64 1
+; CHECK: llvm.lifetime.start.p0i8(i64 1
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.end(i64 1
+; CHECK: llvm.lifetime.end.p0i8(i64 1
   call void @helper()
 ; CHECK-NOT: lifetime
 ; CHECK: ret void
diff --git a/test/Transforms/Inline/lifetime.ll b/test/Transforms/Inline/lifetime.ll
index 4f415e58f1bf..c47091395fce 100644
--- a/test/Transforms/Inline/lifetime.ll
+++ b/test/Transforms/Inline/lifetime.ll
@@ -2,25 +2,25 @@
 ; RUN: opt -passes='cgscc(inline)' -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-declare void @llvm.lifetime.start(i64, i8*)
-declare void @llvm.lifetime.end(i64, i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
define void @helper_both_markers() { %a = alloca i8 ; Size in llvm.lifetime.start / llvm.lifetime.end differs from ; allocation size. We should use the former. - call void @llvm.lifetime.start(i64 2, i8* %a) - call void @llvm.lifetime.end(i64 2, i8* %a) + call void @llvm.lifetime.start.p0i8(i64 2, i8* %a) + call void @llvm.lifetime.end.p0i8(i64 2, i8* %a) ret void } define void @test_both_markers() { ; CHECK-LABEL: @test_both_markers( -; CHECK: llvm.lifetime.start(i64 2 -; CHECK-NEXT: llvm.lifetime.end(i64 2 +; CHECK: llvm.lifetime.start.p0i8(i64 2 +; CHECK-NEXT: llvm.lifetime.end.p0i8(i64 2 call void @helper_both_markers() -; CHECK-NEXT: llvm.lifetime.start(i64 2 -; CHECK-NEXT: llvm.lifetime.end(i64 2 +; CHECK-NEXT: llvm.lifetime.start.p0i8(i64 2 +; CHECK-NEXT: llvm.lifetime.end.p0i8(i64 2 call void @helper_both_markers() ; CHECK-NEXT: ret void ret void @@ -41,14 +41,14 @@ define void @helper_no_markers() { define void @test_no_marker() { ; CHECK-LABEL: @test_no_marker( ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.start(i64 1 +; CHECK: llvm.lifetime.start.p0i8(i64 1 ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.end(i64 1 +; CHECK: llvm.lifetime.end.p0i8(i64 1 call void @helper_no_markers() ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.start(i64 1 +; CHECK: llvm.lifetime.start.p0i8(i64 1 ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.end(i64 1 +; CHECK: llvm.lifetime.end.p0i8(i64 1 call void @helper_no_markers() ; CHECK-NOT: lifetime ; CHECK: ret void @@ -58,23 +58,23 @@ define void @test_no_marker() { define void @helper_two_casts() { %a = alloca i32 %b = bitcast i32* %a to i8* - call void @llvm.lifetime.start(i64 4, i8* %b) + call void @llvm.lifetime.start.p0i8(i64 4, i8* %b) %c = bitcast i32* %a to i8* - call void @llvm.lifetime.end(i64 4, i8* %c) + call void @llvm.lifetime.end.p0i8(i64 4, i8* %c) ret void } define void @test_two_casts() { ; CHECK-LABEL: @test_two_casts( ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.start(i64 4 +; CHECK: llvm.lifetime.start.p0i8(i64 4 ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.end(i64 4 +; CHECK: llvm.lifetime.end.p0i8(i64 4 call void @helper_two_casts() ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.start(i64 4 +; CHECK: llvm.lifetime.start.p0i8(i64 4 ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.end(i64 4 +; CHECK: llvm.lifetime.end.p0i8(i64 4 call void @helper_two_casts() ; CHECK-NOT: lifetime ; CHECK: ret void @@ -91,9 +91,9 @@ define void @helper_arrays_alloca() { define void @test_arrays_alloca() { ; CHECK-LABEL: @test_arrays_alloca( ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.start(i64 40, +; CHECK: llvm.lifetime.start.p0i8(i64 40, ; CHECK-NOT: lifetime -; CHECK: llvm.lifetime.end(i64 40, +; CHECK: llvm.lifetime.end.p0i8(i64 40, call void @helper_arrays_alloca() ; CHECK-NOT: lifetime ; CHECK: ret void diff --git a/test/Transforms/Inline/monster_scc.ll b/test/Transforms/Inline/monster_scc.ll new file mode 100644 index 000000000000..0f8f1f21c8b5 --- /dev/null +++ b/test/Transforms/Inline/monster_scc.ll @@ -0,0 +1,460 @@ +; This test creates a monster SCC with a very pernicious call graph. It builds +; a cycle of cross-connected pairs of functions with interesting inlining +; decisions throughout, but ultimately trivial code complexity. +; +; Typically, a greedy approach to inlining works well for bottom-up inliners +; such as LLVM's. However, there is no way to be bottom-up over an SCC: it's +; a cycle! 
Greedily inlining as much as possible into each function of this
+; *SCC* will have the disastrous effect of inlining all N-1 functions into the
+; first one visited, N-2 functions into the second one visited, N-3 into the
+; third, and so on. This is because until inlining occurs, each function in
+; isolation appears to be an excellent inline candidate.
+;
+; Note that the exact number of calls in each function doesn't really matter.
+; It is mostly a function of cost thresholds and visit order. Because this is an
+; SCC there is no "right" or "wrong" answer here as long as no function blows up
+; to be *huge*. The specific pattern of concern is one or more functions ending
+; up with more than 16 calls in them.
+;
+; This test is extracted from the following C++ program compiled with Clang.
+; The IR is simplified with SROA, instcombine, and simplify-cfg. Then C++
+; linkage stuff, attributes, target-specific things, metadata and comments were
+; removed. The order of the functions is also made more predictable than
+; Clang's output order.
+;
+; void g(int);
+;
+; template <bool K, int N> void f(bool *B, bool *E) {
+;   if (K)
+;     g(N);
+;   if (B == E)
+;     return;
+;   if (*B)
+;     f<true, N + 1>(B + 1, E);
+;   else
+;     f<false, N + 1>(B + 1, E);
+; }
+; template <> void f<false, MAX>(bool *B, bool *E) { return f<false, 0>(B, E); }
+; template <> void f<true, MAX>(bool *B, bool *E) { return f<true, 0>(B, E); }
+;
+; void test(bool *B, bool *E) { f<false, 0>(B, E); }
+;
+; RUN: opt -S < %s -inline -inline-threshold=150 | FileCheck %s --check-prefixes=CHECK,OLD
+; RUN: opt -S < %s -passes=inline -inline-threshold=150 | FileCheck %s --check-prefixes=CHECK,NEW
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @_Z1gi(i32)
+
+; CHECK-LABEL: define void @_Z1fILb0ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi1EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi2EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb0ELi0EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  %cmp = icmp eq i8* %B, %E
+  br i1 %cmp, label %if.end3, label %if.end
+
+if.end:
+  %0 = load i8, i8* %B, align 1
+  %tobool = icmp eq i8 %0, 0
+  %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1
+  br i1 %tobool, label %if.else, label %if.then1
+
+if.then1:
+  call void @_Z1fILb1ELi1EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.else:
+  call void @_Z1fILb0ELi1EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi1EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi2EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb1ELi0EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  call void @_Z1gi(i32 0)
+  %cmp = icmp eq i8* %B, %E
+  br i1 %cmp, label
%if.end3, label %if.end + +if.end: + %0 = load i8, i8* %B, align 1 + %tobool = icmp eq i8 %0, 0 + %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1 + br i1 %tobool, label %if.else, label %if.then1 + +if.then1: + call void @_Z1fILb1ELi1EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.else: + call void @_Z1fILb0ELi1EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.end3: + ret void +} + +; CHECK-LABEL: define void @_Z1fILb0ELi1EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1gi( +; OLD-NOT: call +; OLD: call void @_Z1gi( +; OLD-NOT: call +; OLD: call void @_Z1fILb1ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb0ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb1ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb0ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb0ELi2EEvPbS0_( +; OLD-NOT: call +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi2EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi3EEvPbS0_( +; NEW-NOT: call +define void @_Z1fILb0ELi1EEvPbS0_(i8* %B, i8* %E) { +entry: + %cmp = icmp eq i8* %B, %E + br i1 %cmp, label %if.end3, label %if.end + +if.end: + %0 = load i8, i8* %B, align 1 + %tobool = icmp eq i8 %0, 0 + %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1 + br i1 %tobool, label %if.else, label %if.then1 + +if.then1: + call void @_Z1fILb1ELi2EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.else: + call void @_Z1fILb0ELi2EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.end3: + ret void +} + +; CHECK-LABEL: define void @_Z1fILb1ELi1EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1gi( +; OLD-NOT: call +; OLD: call void @_Z1fILb1ELi2EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb0ELi2EEvPbS0_( +; OLD-NOT: call +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi3EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi3EEvPbS0_( +; NEW-NOT: call +define void @_Z1fILb1ELi1EEvPbS0_(i8* %B, i8* %E) { +entry: + call void @_Z1gi(i32 1) + %cmp = icmp eq i8* %B, %E +; CHECK-NOT: call + br i1 %cmp, label %if.end3, label %if.end + +if.end: + %0 = load i8, i8* %B, align 1 + %tobool = icmp eq i8 %0, 0 + %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1 + br i1 %tobool, label %if.else, label %if.then1 + +if.then1: + call void @_Z1fILb1ELi2EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.else: + call void @_Z1fILb0ELi2EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.end3: + ret void +} + +; CHECK-LABEL: define void @_Z1fILb0ELi2EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1gi( +; OLD-NOT: call +; OLD: call void @_Z1fILb1ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb0ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb1ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb0ELi0EEvPbS0_( +; OLD-NOT: call +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi4EEvPbS0_( +; 
NEW-NOT: call +; NEW: call void @_Z1fILb0ELi0EEvPbS0_( +; NEW-NOT: call +define void @_Z1fILb0ELi2EEvPbS0_(i8* %B, i8* %E) { +entry: + %cmp = icmp eq i8* %B, %E + br i1 %cmp, label %if.end3, label %if.end + +if.end: + %0 = load i8, i8* %B, align 1 + %tobool = icmp eq i8 %0, 0 + %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1 + br i1 %tobool, label %if.else, label %if.then1 + +if.then1: + call void @_Z1fILb1ELi3EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.else: + call void @_Z1fILb0ELi3EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.end3: + ret void +} + +; CHECK-LABEL: define void @_Z1fILb1ELi2EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1gi( +; OLD-NOT: call +; OLD: call void @_Z1gi( +; OLD-NOT: call +; OLD: call void @_Z1fILb1ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb0ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb1ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb0ELi0EEvPbS0_( +; OLD-NOT: call +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi1EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi1EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi1EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi1EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi0EEvPbS0_( +; NEW-NOT: call +define void @_Z1fILb1ELi2EEvPbS0_(i8* %B, i8* %E) { +entry: + call void @_Z1gi(i32 2) + %cmp = icmp eq i8* %B, %E + br i1 %cmp, label %if.end3, label %if.end + +if.end: + %0 = load i8, i8* %B, align 1 + %tobool = icmp eq i8 %0, 0 + %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1 + br i1 %tobool, label %if.else, label %if.then1 + +if.then1: + call void @_Z1fILb1ELi3EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.else: + call void @_Z1fILb0ELi3EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.end3: + ret void +} + +; CHECK-LABEL: define void @_Z1fILb0ELi3EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb1ELi0EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb0ELi0EEvPbS0_( +; OLD-NOT: call +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi1EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi1EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi0EEvPbS0_( +; NEW-NOT: call +define void @_Z1fILb0ELi3EEvPbS0_(i8* %B, i8* %E) { +entry: + %cmp = icmp eq i8* %B, %E + br i1 %cmp, label %if.end3, label %if.end + +if.end: + %0 = load i8, i8* %B, align 1 + %tobool = icmp eq i8 %0, 0 + %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1 + br i1 %tobool, label %if.else, label %if.then1 + +if.then1: + call void @_Z1fILb1ELi4EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.else: + call void @_Z1fILb0ELi4EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.end3: + ret void +} + +; CHECK-LABEL: define void @_Z1fILb1ELi3EEvPbS0_( +; CHECK-NOT: call +; CHECK: call void @_Z1gi( +; CHECK-NOT: call +; CHECK: call void @_Z1fILb1ELi0EEvPbS0_( +; CHECK-NOT: call +; CHECK: call void @_Z1fILb0ELi0EEvPbS0_( +; CHECK-NOT: call +define void @_Z1fILb1ELi3EEvPbS0_(i8* %B, i8* %E) { +entry: + call void @_Z1gi(i32 3) + %cmp = icmp eq i8* %B, %E + br i1 %cmp, label %if.end3, label %if.end + +if.end: + %0 = load i8, i8* %B, align 1 + %tobool = icmp eq i8 %0, 0 + %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1 + br i1 %tobool, label %if.else, label 
%if.then1 + +if.then1: + call void @_Z1fILb1ELi4EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.else: + call void @_Z1fILb0ELi4EEvPbS0_(i8* %add.ptr2, i8* %E) + br label %if.end3 + +if.end3: + ret void +} + +; CHECK-LABEL: define void @_Z1fILb0ELi4EEvPbS0_( +; CHECK-NOT: call +; CHECK: call void @_Z1fILb0ELi0EEvPbS0_( +; CHECK-NOT: call +define void @_Z1fILb0ELi4EEvPbS0_(i8* %B, i8* %E) { +entry: + call void @_Z1fILb0ELi0EEvPbS0_(i8* %B, i8* %E) + ret void +} + +; CHECK-LABEL: define void @_Z1fILb1ELi4EEvPbS0_( +; OLD-NOT: call +; OLD: call void @_Z1fILb1ELi0EEvPbS0_( +; OLD-NOT: call +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi1EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi2EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1gi( +; NEW-NOT: call +; NEW: call void @_Z1fILb1ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi0EEvPbS0_( +; NEW-NOT: call +; NEW: call void @_Z1fILb0ELi3EEvPbS0_( +; NEW-NOT: call +define void @_Z1fILb1ELi4EEvPbS0_(i8* %B, i8* %E) { +entry: + call void @_Z1fILb1ELi0EEvPbS0_(i8* %B, i8* %E) + ret void +} + +; CHECK-LABEL: define void @_Z4testPbS_( +; CHECK: call +; CHECK-NOT: call +define void @_Z4testPbS_(i8* %B, i8* %E) { +entry: + call void @_Z1fILb0ELi0EEvPbS0_(i8* %B, i8* %E) + ret void +} + diff --git a/test/Transforms/Inline/optimization-remarks-with-hotness.ll b/test/Transforms/Inline/optimization-remarks-with-hotness.ll index 9611a2dd1bd4..1d6d135bdda8 100644 --- a/test/Transforms/Inline/optimization-remarks-with-hotness.ll +++ b/test/Transforms/Inline/optimization-remarks-with-hotness.ll @@ -4,8 +4,7 @@ ; CHECK: foo should always be inlined (cost=always) (hotness: 30) ; CHECK: foo inlined into bar (hotness: 30) -; CHECK: foz should never be inlined (cost=never) (hotness: 30) -; CHECK: foz will not be inlined into bar (hotness: 30) +; CHECK: foz not inlined into bar because it should never be inlined (cost=never) (hotness: 30) ; Function Attrs: alwaysinline nounwind uwtable define i32 @foo() #0 !prof !1 { diff --git a/test/Transforms/Inline/optimization-remarks.ll b/test/Transforms/Inline/optimization-remarks.ll index 59cf08327350..61e270cff76c 100644 --- a/test/Transforms/Inline/optimization-remarks.ll +++ b/test/Transforms/Inline/optimization-remarks.ll @@ -9,8 +9,7 @@ ; NO_HOTNESS-NOT: fox will not be inlined into bar because its definition is unavailable ; CHECK: foo should always be inlined (cost=always) ; CHECK: foo inlined into bar -; CHECK: foz should never be inlined (cost=never) -; CHECK: foz will not be inlined into bar +; CHECK: foz not inlined into bar because it should never be inlined (cost=never) ; Function Attrs: alwaysinline nounwind uwtable define i32 @foo(i32 %x, i32 %y) #0 { diff --git a/test/Transforms/Inline/prof-update.ll b/test/Transforms/Inline/prof-update.ll new file mode 100644 index 000000000000..38fcc7e45996 --- /dev/null +++ b/test/Transforms/Inline/prof-update.ll @@ -0,0 +1,39 @@ +; RUN: opt < %s -inline -S | FileCheck %s +; Checks if inliner updates branch_weights annotation for call instructions. + +declare void @ext(); +declare void @ext1(); + +; CHECK: define void @callee(i32 %n) !prof ![[ENTRY_COUNT:[0-9]*]] +define void @callee(i32 %n) !prof !1 { + %cond = icmp sle i32 %n, 10 + br i1 %cond, label %cond_true, label %cond_false +cond_true: +; ext1 is optimized away, thus not updated. +; CHECK: call void @ext1(), !prof ![[COUNT_CALLEE1:[0-9]*]] + call void @ext1(), !prof !2 + ret void +cond_false: +; ext is cloned and updated. 
+; CHECK: call void @ext(), !prof ![[COUNT_CALLEE:[0-9]*]] + call void @ext(), !prof !2 + ret void +} + +; CHECK: define void @caller() +define void @caller() { +; CHECK: call void @ext(), !prof ![[COUNT_CALLER:[0-9]*]] + call void @callee(i32 15), !prof !3 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"MaxFunctionCount", i32 2000} +!1 = !{!"function_entry_count", i64 1000} +!2 = !{!"branch_weights", i64 2000} +!3 = !{!"branch_weights", i64 400} +attributes #0 = { alwaysinline } +; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600} +; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i64 2000} +; CHECK: ![[COUNT_CALLEE]] = !{!"branch_weights", i32 1200} +; CHECK: ![[COUNT_CALLER]] = !{!"branch_weights", i32 800} diff --git a/test/Transforms/InstCombine/2008-01-29-AddICmp.ll b/test/Transforms/InstCombine/2008-01-29-AddICmp.ll deleted file mode 100644 index a33eb9c1ddd4..000000000000 --- a/test/Transforms/InstCombine/2008-01-29-AddICmp.ll +++ /dev/null @@ -1,85 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -S | FileCheck %s - -; PR1949 - -define i1 @test1(i32 %a) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 %a, -5 -; CHECK-NEXT: ret i1 [[C]] -; - %b = add i32 %a, 4 - %c = icmp ult i32 %b, 4 - ret i1 %c -} - -define <2 x i1> @test1vec(<2 x i32> %a) { -; CHECK-LABEL: @test1vec( -; CHECK-NEXT: [[C:%.*]] = icmp ugt <2 x i32> %a, <i32 -5, i32 -5> -; CHECK-NEXT: ret <2 x i1> [[C]] -; - %b = add <2 x i32> %a, <i32 4, i32 4> - %c = icmp ult <2 x i32> %b, <i32 4, i32 4> - ret <2 x i1> %c -} - -define i1 @test2(i32 %a) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 %a, 4 -; CHECK-NEXT: ret i1 [[C]] -; - %b = sub i32 %a, 4 - %c = icmp ugt i32 %b, -5 - ret i1 %c -} - -define <2 x i1> @test2vec(<2 x i32> %a) { -; CHECK-LABEL: @test2vec( -; CHECK-NEXT: [[C:%.*]] = icmp ult <2 x i32> %a, <i32 4, i32 4> -; CHECK-NEXT: ret <2 x i1> [[C]] -; - %b = sub <2 x i32> %a, <i32 4, i32 4> - %c = icmp ugt <2 x i32> %b, <i32 -5, i32 -5> - ret <2 x i1> %c -} - -define i1 @test3(i32 %a) { -; CHECK-LABEL: @test3( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 %a, 2147483643 -; CHECK-NEXT: ret i1 [[C]] -; - %b = add i32 %a, 4 - %c = icmp slt i32 %b, 2147483652 - ret i1 %c -} - -define <2 x i1> @test3vec(<2 x i32> %a) { -; CHECK-LABEL: @test3vec( -; CHECK-NEXT: [[C:%.*]] = icmp sgt <2 x i32> %a, <i32 2147483643, i32 2147483643> -; CHECK-NEXT: ret <2 x i1> [[C]] -; - %b = add <2 x i32> %a, <i32 4, i32 4> - %c = icmp slt <2 x i32> %b, <i32 2147483652, i32 2147483652> - ret <2 x i1> %c -} - -define i1 @test4(i32 %a) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: [[C:%.*]] = icmp slt i32 %a, -4 -; CHECK-NEXT: ret i1 [[C]] -; - %b = add i32 %a, 2147483652 - %c = icmp sge i32 %b, 4 - ret i1 %c -} - -define <2 x i1> @test4vec(<2 x i32> %a) { -; CHECK-LABEL: @test4vec( -; CHECK-NEXT: [[C:%.*]] = icmp slt <2 x i32> %a, <i32 -4, i32 -4> -; CHECK-NEXT: ret <2 x i1> [[C]] -; - %b = add <2 x i32> %a, <i32 2147483652, i32 2147483652> - %c = icmp sge <2 x i32> %b, <i32 4, i32 4> - ret <2 x i1> %c -} - diff --git a/test/Transforms/InstCombine/2008-05-22-NegValVector.ll b/test/Transforms/InstCombine/2008-05-22-NegValVector.ll index bf92faf2fec5..58259be8bc92 100644 --- a/test/Transforms/InstCombine/2008-05-22-NegValVector.ll +++ b/test/Transforms/InstCombine/2008-05-22-NegValVector.ll @@ -6,3 +6,9 @@ define <3 x i8> @f(<3 x i8> %a) { ret <3 x i8> %B } +define <3 x i4> @g(<3 x i4> %a) { + %A = sub <3 x i4> zeroinitializer, %a 
+ %B = mul <3 x i4> %A, <i4 5, i4 5, i4 5> + ret <3 x i4> %B +} + diff --git a/test/Transforms/InstCombine/2008-11-20-DivMulRem.ll b/test/Transforms/InstCombine/2008-11-20-DivMulRem.ll deleted file mode 100644 index 0c0e55a0b2d9..000000000000 --- a/test/Transforms/InstCombine/2008-11-20-DivMulRem.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s -; PR3103 - -define i8 @test1(i8 %x, i8 %y) { -; CHECK-LABEL: @test1( - %A = udiv i8 %x, %y -; CHECK-NEXT: urem - %B = mul i8 %A, %y - %C = sub i8 %x, %B - ret i8 %C -; CHECK-NEXT: ret -} - -define i8 @test2(i8 %x, i8 %y) { -; CHECK-LABEL: @test2( - %A = sdiv i8 %x, %y -; CHECK-NEXT: srem - %B = mul i8 %A, %y - %C = sub i8 %x, %B - ret i8 %C -; CHECK-NEXT: ret -} - -define i8 @test3(i8 %x, i8 %y) { -; CHECK-LABEL: @test3( - %A = udiv i8 %x, %y -; CHECK-NEXT: urem - %B = mul i8 %A, %y - %C = sub i8 %B, %x -; CHECK-NEXT: sub - ret i8 %C -; CHECK-NEXT: ret -} - -define i8 @test4(i8 %x) { -; CHECK-LABEL: @test4( - %A = udiv i8 %x, 3 -; CHECK-NEXT: urem - %B = mul i8 %A, -3 -; CHECK-NEXT: sub - %C = sub i8 %x, %B -; CHECK-NEXT: add - ret i8 %C -; CHECK-NEXT: ret -} - -define i32 @test5(i32 %x, i32 %y) { -; CHECK-LABEL: @test5( -; (((X / Y) * Y) / Y) -> X / Y - %div = sdiv i32 %x, %y -; CHECK-NEXT: sdiv - %mul = mul i32 %div, %y - %r = sdiv i32 %mul, %y - ret i32 %r -; CHECK-NEXT: ret -} - -define i32 @test6(i32 %x, i32 %y) { -; CHECK-LABEL: @test6( -; (((X / Y) * Y) / Y) -> X / Y - %div = udiv i32 %x, %y -; CHECK-NEXT: udiv - %mul = mul i32 %div, %y - %r = udiv i32 %mul, %y - ret i32 %r -; CHECK-NEXT: ret -} diff --git a/test/Transforms/InstCombine/2009-03-20-AShrOverShift.ll b/test/Transforms/InstCombine/2009-03-20-AShrOverShift.ll deleted file mode 100644 index 4d4797720c53..000000000000 --- a/test/Transforms/InstCombine/2009-03-20-AShrOverShift.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: opt < %s -instcombine -S | grep "ashr i32 %val, 31" -; PR3851 - -define i32 @foo2(i32 %val) nounwind { -entry: - %shr = ashr i32 %val, 15 ; <i32> [#uses=3] - %shr4 = ashr i32 %shr, 17 ; <i32> [#uses=1] - ret i32 %shr4 - } diff --git a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll index 14fcf52fe9a7..71255ebbf81f 100644 --- a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll +++ b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -default-data-layout="e-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=LE -; RUN: opt < %s -default-data-layout="E-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=BE +; RUN: opt < %s -data-layout="e-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=LE +; RUN: opt < %s -data-layout="E-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=BE ; PR13442 @test = constant [4 x i32] [i32 1, i32 2, i32 3, i32 4] diff --git a/test/Transforms/InstCombine/X86FsubCmpCombine.ll b/test/Transforms/InstCombine/X86FsubCmpCombine.ll new file mode 100644 index 000000000000..fde0692d00a2 --- /dev/null +++ b/test/Transforms/InstCombine/X86FsubCmpCombine.ll @@ -0,0 +1,181 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +; The test checks the folding of cmp(sub(a,b),0) into cmp(a,b). 
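+; A note on why the fast-math flag matters (illustrative; not itself checked):
+; if %a and %b could both be +infinity, %a - %b would be NaN and could compare
+; differently from comparing %a with %b directly, so only the ninf fsub variants
+; below are folded; the plain fsub in the _safe test is left alone.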
+ +define i8 @sub_compare_foldingPD128_safe(<2 x double> %a, <2 x double> %b){ +; CHECK-LABEL: @sub_compare_foldingPD128_safe( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB_SAFE:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5, i8 -1) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.safe = fsub <2 x double> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.safe , <2 x double> zeroinitializer, i32 5, i8 -1) + ret i8 %0 +} + + +define i8 @sub_compare_foldingPD128(<2 x double> %a, <2 x double> %b){ +; CHECK-LABEL: @sub_compare_foldingPD128( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5, i8 -1) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i = fsub ninf <2 x double> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.i , <2 x double> zeroinitializer, i32 5, i8 -1) + ret i8 %0 +} + + +define i8 @sub_compare_foldingPD256(<4 x double> %a, <4 x double> %b){ +; CHECK-LABEL: @sub_compare_foldingPD256( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5, i8 -1) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i1 = fsub ninf <4 x double> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5, i8 -1) + ret i8 %0 +} + + +define i8 @sub_compare_foldingPD512(<8 x double> %a, <8 x double> %b){ +; CHECK-LABEL: @sub_compare_foldingPD512( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i8 -1, i32 4) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i2 = fsub ninf <8 x double> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i8 -1, i32 4) + ret i8 %0 +} + + +define i8 @sub_compare_foldingPS128(<4 x float> %a, <4 x float> %b){ +; CHECK-LABEL: @sub_compare_foldingPS128( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12, i8 -1) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i3 = fsub ninf <4 x float> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12, i8 -1) + ret i8 %0 +} + + +define i8 @sub_compare_foldingPS256(<8 x float> %a, <8 x float> %b){ +; CHECK-LABEL: @sub_compare_foldingPS256( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5, i8 -1) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i4 = fsub ninf <8 x float> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5, i8 -1) + ret i8 %0 +} + + +define i16 @sub_compare_foldingPS512(<16 x float> %a, <16 x float> %b){ +; CHECK-LABEL: @sub_compare_foldingPS512( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i16 -1, i32 4) +; CHECK-NEXT: ret i16 [[TMP0]] +; +entry: + %sub.i5 = fsub ninf <16 x float> %a, %b + %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %sub.i5, <16 x float> 
zeroinitializer, i32 11, i16 -1, i32 4) + ret i16 %0 +} + + + +define i8 @sub_compare_folding_swapPD128(<2 x double> %a, <2 x double> %b){ +; CHECK-LABEL: @sub_compare_folding_swapPD128( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5, i8 -1) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i = fsub ninf <2 x double> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5, i8 -1) + ret i8 %0 +} + + +define i8 @sub_compare_folding_swapPD256(<4 x double> %a, <4 x double> %b){ +; CHECK-LABEL: @sub_compare_folding_swapPD256( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5, i8 -1) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i = fsub ninf <4 x double> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5, i8 -1) + ret i8 %0 +} + + +define i8 @sub_compare_folding_swapPD512(<8 x double> %a, <8 x double> %b){ +; CHECK-LABEL: @sub_compare_folding_swapPD512( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i8 -1, i32 4) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i = fsub ninf <8 x double> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i8 -1, i32 4) + ret i8 %0 +} + + +define i8 @sub_compare_folding_swapPS128(<4 x float> %a, <4 x float> %b){ +; CHECK-LABEL: @sub_compare_folding_swapPS128( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12, i8 -1) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i = fsub ninf <4 x float> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12, i8 -1) + ret i8 %0 +} + + +define i8 @sub_compare_folding_swapPS256(<8 x float> %a, <8 x float> %b){ +; CHECK-LABEL: @sub_compare_folding_swapPS256( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5, i8 -1) +; CHECK-NEXT: ret i8 [[TMP0]] +; +entry: + %sub.i = fsub ninf <8 x float> %a, %b + %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5, i8 -1) + ret i8 %0 +} + + +define i16 @sub_compare_folding_swapPS512(<16 x float> %a, <16 x float> %b){ +; CHECK-LABEL: @sub_compare_folding_swapPS512( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i16 -1, i32 4) +; CHECK-NEXT: ret i16 [[TMP0]] +; +entry: + %sub.i = fsub ninf <16 x float> %a, %b + %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %sub.i, i32 11, i16 -1, i32 4) + ret i16 %0 +} + +declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, i8) +declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, i8) +declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i8, i32) +declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8) +declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8) +declare i16 
@llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32) diff --git a/test/Transforms/InstCombine/add-sitofp.ll b/test/Transforms/InstCombine/add-sitofp.ll index 3b5485e00528..2abfa436f6d3 100644 --- a/test/Transforms/InstCombine/add-sitofp.ll +++ b/test/Transforms/InstCombine/add-sitofp.ll @@ -1,6 +1,14 @@ -; RUN: opt < %s -instcombine -S | grep "add nuw nsw i32" +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s -define double @x(i32 %a, i32 %b) nounwind { +define double @x(i32 %a, i32 %b) { +; CHECK-LABEL: @x( +; CHECK-NEXT: [[M:%.*]] = lshr i32 [[A:%.*]], 24 +; CHECK-NEXT: [[N:%.*]] = and i32 [[M]], [[B:%.*]] +; CHECK-NEXT: [[ADDCONV:%.*]] = add nuw nsw i32 [[N]], 1 +; CHECK-NEXT: [[P:%.*]] = sitofp i32 [[ADDCONV]] to double +; CHECK-NEXT: ret double [[P]] +; %m = lshr i32 %a, 24 %n = and i32 %m, %b %o = sitofp i32 %n to double diff --git a/test/Transforms/InstCombine/add.ll b/test/Transforms/InstCombine/add.ll index 39a746ab310b..648305d134cd 100644 --- a/test/Transforms/InstCombine/add.ll +++ b/test/Transforms/InstCombine/add.ll @@ -1,6 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s +; TODO: This should be canonicalized to either a select or xor+zext. + +define i32 @select_0_or_1_from_bool(i1 %x) { +; CHECK-LABEL: @select_0_or_1_from_bool( +; CHECK-NEXT: [[EXT:%.*]] = sext i1 %x to i32 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[EXT]], 1 +; CHECK-NEXT: ret i32 [[ADD]] +; + %ext = sext i1 %x to i32 + %add = add i32 %ext, 1 + ret i32 %add +} + +; TODO: This should be canonicalized to either a select or xor+zext. + +define <2 x i32> @select_0_or_1_from_bool_vec(<2 x i1> %x) { +; CHECK-LABEL: @select_0_or_1_from_bool_vec( +; CHECK-NEXT: [[EXT:%.*]] = sext <2 x i1> %x to <2 x i32> +; CHECK-NEXT: [[ADD:%.*]] = add nsw <2 x i32> [[EXT]], <i32 1, i32 1> +; CHECK-NEXT: ret <2 x i32> [[ADD]] +; + %ext = sext <2 x i1> %x to <2 x i32> + %add = add <2 x i32> %ext, <i32 1, i32 1> + ret <2 x i32> %add +} + define i32 @test1(i32 %A) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: ret i32 %A @@ -100,7 +126,7 @@ define i32 @test9(i32 %A) { define i1 @test10(i8 %A, i8 %b) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[B:%.*]] = sub i8 0, %b -; CHECK-NEXT: [[C:%.*]] = icmp ne i8 %A, [[B]] +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[B]], %A ; CHECK-NEXT: ret i1 [[C]] ; %B = add i8 %A, %b @@ -112,7 +138,7 @@ define i1 @test10(i8 %A, i8 %b) { define <2 x i1> @test10vec(<2 x i8> %a, <2 x i8> %b) { ; CHECK-LABEL: @test10vec( ; CHECK-NEXT: [[C:%.*]] = sub <2 x i8> zeroinitializer, %b -; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i8> %a, [[C]] +; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i8> [[C]], %a ; CHECK-NEXT: ret <2 x i1> [[D]] ; %c = add <2 x i8> %a, %b @@ -244,14 +270,59 @@ define i32 @test19(i1 %C) { ret i32 %V } +define <2 x i32> @test19vec(i1 %C) { +; CHECK-LABEL: @test19vec( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1123, i32 1123>, <2 x i32> <i32 133, i32 133> +; CHECK-NEXT: ret <2 x i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10> + %V = add <2 x i32> %A, <i32 123, i32 123> + ret <2 x i32> %V +} + +; This is an InstSimplify fold, but test it here to make sure that +; InstCombine does not prevent the fold. +; With NSW, add of sign bit -> or of sign bit. 
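+; A worked note (an illustration, not one of the generated tests): adding the +; sign bit is the same as xor'ing it, because the carry out of the top bit is +; discarded, so the two sign-bit toggles in @test20 below cancel: +;   (%x ^ 0x80000000) + 0x80000000 == (%x ^ 0x80000000) ^ 0x80000000 == %x +; The no-wrap variants further down show the related add -> 'or' rewrite.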
+ define i32 @test20(i32 %x) { ; CHECK-LABEL: @test20( ; CHECK-NEXT: ret i32 %x ; - %tmp.2 = xor i32 %x, -2147483648 - ;; Add of sign bit -> xor of sign bit. - %tmp.4 = add i32 %tmp.2, -2147483648 - ret i32 %tmp.4 + %y = xor i32 %x, -2147483648 + %z = add nsw i32 %y, -2147483648 + ret i32 %z +} + +define i32 @xor_sign_bit(i32 %x) { +; CHECK-LABEL: @xor_sign_bit( +; CHECK-NEXT: [[ADD:%.*]] = add i32 %x, -2147483606 +; CHECK-NEXT: ret i32 [[ADD]] +; + %xor = xor i32 %x, 2147483648 + %add = add i32 %xor, 42 + ret i32 %add +} + +; No-wrap info allows converting the add to 'or'. + +define i8 @add_nsw_signbit(i8 %x) { +; CHECK-LABEL: @add_nsw_signbit( +; CHECK-NEXT: [[Y:%.*]] = or i8 %x, -128 +; CHECK-NEXT: ret i8 [[Y]] +; + %y = add nsw i8 %x, -128 + ret i8 %y +} + +; No-wrap info allows converting the add to 'or'. + +define i8 @add_nuw_signbit(i8 %x) { +; CHECK-LABEL: @add_nuw_signbit( +; CHECK-NEXT: [[Y:%.*]] = or i8 %x, -128 +; CHECK-NEXT: ret i8 [[Y]] +; + %y = add nuw i8 %x, 128 + ret i8 %y } define i1 @test21(i32 %x) { @@ -519,3 +590,99 @@ define i64 @test41(i32 %a) { %sub = add i64 %zext, -1 ret i64 %sub } + +define i32 @test42(i1 %C) { +; CHECK-LABEL: @test42( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], i32 1123, i32 133 +; CHECK-NEXT: ret i32 [[V]] +; + %A = select i1 %C, i32 1000, i32 10 + %V = add i32 123, %A + ret i32 %V +} + +define <2 x i32> @test42vec(i1 %C) { +; CHECK-LABEL: @test42vec( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1123, i32 1123>, <2 x i32> <i32 133, i32 133> +; CHECK-NEXT: ret <2 x i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10> + %V = add <2 x i32> <i32 123, i32 123>, %A + ret <2 x i32> %V +} + +define <2 x i32> @test42vec2(i1 %C) { +; CHECK-LABEL: @test42vec2( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1123, i32 2833>, <2 x i32> <i32 133, i32 363> +; CHECK-NEXT: ret <2 x i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30> + %V = add <2 x i32> <i32 123, i32 333>, %A + ret <2 x i32> %V +} + +define i32 @test55(i1 %which) { +; CHECK-LABEL: @test55( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi i32 [ 1123, [[ENTRY:%.*]] ], [ 133, [[DELAY]] ] +; CHECK-NEXT: ret i32 [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi i32 [ 1000, %entry ], [ 10, %delay ] + %value = add i32 123, %A + ret i32 %value +} + +define <2 x i32> @test43vec(i1 %which) { +; CHECK-LABEL: @test43vec( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 1123, i32 1123>, [[ENTRY:%.*]] ], [ <i32 133, i32 133>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ] + %value = add <2 x i32> <i32 123, i32 123>, %A + ret <2 x i32> %value +} + +define <2 x i32> @test43vec2(i1 %which) { +; CHECK-LABEL: @test43vec2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 1123, i32 2833>, 
[[ENTRY:%.*]] ], [ <i32 133, i32 363>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ] + %value = add <2 x i32> <i32 123, i32 333>, %A + ret <2 x i32> %value +} diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll index 2ee0372e5e0a..f81f700e6cf4 100644 --- a/test/Transforms/InstCombine/alloca.ll +++ b/test/Transforms/InstCombine/alloca.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -instcombine -S -default-data-layout="E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s -check-prefix=CHECK -check-prefix=ALL -; RUN: opt < %s -instcombine -S -default-data-layout="E-p:32:32:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s -check-prefix=P32 -check-prefix=ALL +; RUN: opt < %s -instcombine -S -data-layout="E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s -check-prefix=CHECK -check-prefix=ALL +; RUN: opt < %s -instcombine -S -data-layout="E-p:32:32:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s -check-prefix=P32 -check-prefix=ALL ; RUN: opt < %s -instcombine -S | FileCheck %s -check-prefix=NODL -check-prefix=ALL diff --git a/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll b/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll new file mode 100644 index 000000000000..888f51bf939d --- /dev/null +++ b/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll @@ -0,0 +1,322 @@ +; RUN: opt -S -instcombine %s | FileCheck %s + +; -------------------------------------------------------------------- +; llvm.amdgcn.buffer.load +; -------------------------------------------------------------------- + +; CHECK-LABEL: @buffer_load_f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + ret float %data +} + +; CHECK-LABEL: @buffer_load_v1f32( +; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <1 x float> %data +define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + ret <1 x float> %data +} + +; CHECK-LABEL: @buffer_load_v2f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + ret <2 x float> %data +} + +; CHECK-LABEL: @buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <4 x float> %data +define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> 
inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + ret <4 x float> %data +} + +; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( +; CHECK: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt0 = extractelement <2 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <2 x float> %data, i32 1 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt0 = extractelement <4 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <4 x float> %data, i32 1 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <4 x float> %data, i32 2 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <4 x float> %data, i32 3 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, 
i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: ret <2 x float> %shuf +define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: ret <2 x float> %shuf +define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> +; CHECK-NEXT: ret <3 x float> %shuf +define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> + ret <3 x float> %shuf +} + +; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3> +; CHECK-NEXT: ret <3 x float> %shuf +define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3> + ret <3 x float> %shuf +} + +; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3> +; CHECK-NEXT: ret <3 x float> %shuf +define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 
%idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3> + ret <3 x float> %shuf +} + +; FIXME: Not handled even though only 2 elts used +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt0 = extractelement <4 x float> %data, i32 0 +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 1 +; CHECK-NEXT: %ins0 = insertvalue { float, float } undef, float %elt0, 0 +; CHECK-NEXT: %ins1 = insertvalue { float, float } %ins0, float %elt1, 1 +; CHECK-NEXT: ret { float, float } %ins1 +define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt0 = extractelement <4 x float> %data, i32 0 + %elt1 = extractelement <4 x float> %data, i32 1 + %ins0 = insertvalue { float, float } undef, float %elt0, 0 + %ins1 = insertvalue { float, float } %ins0, float %elt1, 1 + ret { float, float } %ins1 +} + +; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt0 = extractelement <3 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <3 x float> %data, i32 1 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 +define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %elt1 = extractelement <3 x float> %data, i32 2 + ret float %elt1 +} + +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( +; CHECK-NEXT: %data 
= call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: ret <2 x float> %shuf +define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: ret float %data +define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 + %elt0 = extractelement <2 x float> %data, i32 0 + ret float %elt0 +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.buffer.load.format +; -------------------------------------------------------------------- + +; CHECK-LABEL: @buffer_load_format_v1f32( +; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) +; CHECK-NEXT: ret <1 x float> %data +define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) + ret <1 x float> %data +} + +; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( +; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) +; CHECK-NEXT: ret float %data +define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) + %elt0 = extractelement <2 x float> %data, i32 0 + ret float %elt0 +} + +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1> + ret <2 x float> %shuf +} + +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> %data +define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) + %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1> + ret <2 x float> %shuf +} + +; The initial insertion point is at the extractelement +; 
CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: %tmp = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) +; CHECK-NEXT: %1 = shufflevector <2 x float> %tmp, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> +; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <2 x double> +; CHECK-NEXT: %tmp2 = extractelement <2 x double> %tmp1, i32 0 +; CHECK-NEXT: ret double %tmp2 +define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { + %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %tmp1 = bitcast <4 x float> %tmp to <2 x double> + %tmp2 = extractelement <2 x double> %tmp1, i32 0 + ret double %tmp2 +} + +; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) +; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 +; CHECK-NEXT: ret i32 %tmp2 +define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { + %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %tmp1 = bitcast <4 x float> %tmp to <4 x i32> + %tmp2 = extractelement <4 x i32> %tmp1, i32 0 + ret i32 %tmp2 +} + +; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) +; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %tmp, i64 0 +; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <8 x i16> +; CHECK-NEXT: %tmp2 = extractelement <8 x i16> %tmp1, i32 0 +; CHECK-NEXT: ret i16 %tmp2 +define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { + %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %tmp1 = bitcast <4 x float> %tmp to <8 x i16> + %tmp2 = extractelement <8 x i16> %tmp1, i32 0 + ret i16 %tmp2 +} + +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + +!0 = !{float 2.500000e+00} diff --git a/test/Transforms/InstCombine/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/amdgcn-intrinsics.ll index a228968f25bc..deae5502bcdb 100644 --- a/test/Transforms/InstCombine/amdgcn-intrinsics.ll +++ b/test/Transforms/InstCombine/amdgcn-intrinsics.ll @@ -7,6 +7,12 @@ declare float @llvm.amdgcn.rcp.f32(float) nounwind readnone declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone +; CHECK-LABEL: @test_constant_fold_rcp_f32_undef +; CHECK-NEXT: ret float undef +define float 
@test_constant_fold_rcp_f32_undef() nounwind { + %val = call float @llvm.amdgcn.rcp.f32(float undef) nounwind readnone + ret float %val +} ; CHECK-LABEL: @test_constant_fold_rcp_f32_1 ; CHECK-NEXT: ret float 1.000000e+00 @@ -50,6 +56,18 @@ define double @test_constant_fold_rcp_f64_43() nounwind { ret double %val } +; -------------------------------------------------------------------- +; llvm.amdgcn.rsq +; -------------------------------------------------------------------- + +declare float @llvm.amdgcn.rsq.f32(float) nounwind readnone + +; CHECK-LABEL: @test_constant_fold_rsq_f32_undef +; CHECK-NEXT: ret float undef +define float @test_constant_fold_rsq_f32_undef() nounwind { + %val = call float @llvm.amdgcn.rsq.f32(float undef) nounwind readnone + ret float %val +} ; -------------------------------------------------------------------- ; llvm.amdgcn.frexp.mant @@ -633,3 +651,888 @@ define float @cos_fabs_fneg_f32(float %x) { %cos = call float @llvm.amdgcn.cos.f32(float %x.fabs.fneg) ret float %cos } + +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pkrtz +; -------------------------------------------------------------------- + +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) nounwind readnone + +; CHECK-LABEL: @vars_lhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) +define <2 x half> @vars_lhs_cvt_pkrtz(float %x, float %y) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @constant_lhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %y) +define <2 x half> @constant_lhs_cvt_pkrtz(float %y) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %y) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @constant_rhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.000000e+00) +define <2 x half> @constant_rhs_cvt_pkrtz(float %x) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.0) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @undef_lhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %y) +define <2 x half> @undef_lhs_cvt_pkrtz(float %y) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %y) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pkrtz( +; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef) +define <2 x half> @undef_rhs_cvt_pkrtz(float %x) { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @undef_cvt_pkrtz( +; CHECK: ret <2 x half> undef +define <2 x half> @undef_cvt_pkrtz() { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @constant_splat0_cvt_pkrtz( +; CHECK: ret <2 x half> zeroinitializer +define <2 x half> @constant_splat0_cvt_pkrtz() { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float 0.0) + ret <2 x half> %cvt +} + +; CHECK-LABEL: @constant_cvt_pkrtz( +; CHECK: ret <2 x half> <half 0xH4000, half 0xH4400> +define <2 x half> @constant_cvt_pkrtz() { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 2.0, float 4.0) + ret <2 x half> %cvt +} + +; Test constant values where rtz changes result +; CHECK-LABEL: @constant_rtz_pkrtz( +; CHECK: ret <2 x half> <half 0xH7BFF, half 0xH7BFF> +define <2 x half> @constant_rtz_pkrtz() { + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 65535.0, float 65535.0) 
+ ret <2 x half> %cvt +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.ubfe +; -------------------------------------------------------------------- + +declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) nounwind readnone +declare i64 @llvm.amdgcn.ubfe.i64(i64, i32, i32) nounwind readnone + +; CHECK-LABEL: @ubfe_var_i32( +; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 %width) +define i32 @ubfe_var_i32(i32 %src, i32 %offset, i32 %width) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 %width) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_clear_high_bits_constant_offset_i32( +; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 5, i32 %width) +define i32 @ubfe_clear_high_bits_constant_offset_i32(i32 %src, i32 %width) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 133, i32 %width) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_clear_high_bits_constant_width_i32( +; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 5) +define i32 @ubfe_clear_high_bits_constant_width_i32(i32 %src, i32 %offset) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 133) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_width_0( +; CHECK-NEXT: ret i32 0 +define i32 @ubfe_width_0(i32 %src, i32 %offset) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 0) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_width_31( +; CHECK: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 31) +define i32 @ubfe_width_31(i32 %src, i32 %offset) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 31) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_width_32( +; CHECK-NEXT: ret i32 0 +define i32 @ubfe_width_32(i32 %src, i32 %offset) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 32) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_width_33( +; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 1) +define i32 @ubfe_width_33(i32 %src, i32 %offset) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 33) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_offset_33( +; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 1, i32 %width) +define i32 @ubfe_offset_33(i32 %src, i32 %width) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 33, i32 %width) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_offset_0( +; CHECK-NEXT: %1 = sub i32 32, %width +; CHECK-NEXT: %2 = shl i32 %src, %1 +; CHECK-NEXT: %bfe = lshr i32 %2, %1 +; CHECK-NEXT: ret i32 %bfe +define i32 @ubfe_offset_0(i32 %src, i32 %width) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_offset_32( +; CHECK-NEXT: %1 = sub i32 32, %width +; CHECK-NEXT: %2 = shl i32 %src, %1 +; CHECK-NEXT: %bfe = lshr i32 %2, %1 +; CHECK-NEXT: ret i32 %bfe +define i32 @ubfe_offset_32(i32 %src, i32 %width) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 32, i32 %width) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_offset_31( +; CHECK-NEXT: %1 = sub i32 32, %width +; CHECK-NEXT: %2 = shl i32 %src, %1 +; CHECK-NEXT: %bfe = lshr i32 %2, %1 +; CHECK-NEXT: ret i32 %bfe +define i32 @ubfe_offset_31(i32 %src, i32 %width) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 32, i32 %width) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_offset_0_width_0( +; CHECK-NEXT: ret i32 0 +define i32 @ubfe_offset_0_width_0(i32 %src) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 0) + ret i32 %bfe +} + +; CHECK-LABEL: 
@ubfe_offset_0_width_3( +; CHECK-NEXT: and i32 %src, 7 +; CHECK-NEXT: ret +define i32 @ubfe_offset_0_width_3(i32 %src) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 3) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_offset_3_width_1( +; CHECK-NEXT: %1 = lshr i32 %src, 3 +; CHECK-NEXT: and i32 %1, 1 +; CHECK-NEXT: ret i32 +define i32 @ubfe_offset_3_width_1(i32 %src) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 3, i32 1) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_offset_3_width_4( +; CHECK-NEXT: %1 = lshr i32 %src, 3 +; CHECK-NEXT: and i32 %1, 15 +; CHECK-NEXT: ret i32 +define i32 @ubfe_offset_3_width_4(i32 %src) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 3, i32 4) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_0_0_0( +; CHECK-NEXT: ret i32 0 +define i32 @ubfe_0_0_0() { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_neg1_5_7( +; CHECK-NEXT: ret i32 127 +define i32 @ubfe_neg1_5_7() { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 -1, i32 5, i32 7) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_undef_src_i32( +; CHECK-NEXT: ret i32 undef +define i32 @ubfe_undef_src_i32(i32 %offset, i32 %width) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 undef, i32 %offset, i32 %width) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_undef_offset_i32( +; CHECK: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 undef, i32 %width) +define i32 @ubfe_undef_offset_i32(i32 %src, i32 %width) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 undef, i32 %width) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_undef_width_i32( +; CHECK: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 undef) +define i32 @ubfe_undef_width_i32(i32 %src, i32 %offset) { + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 undef) + ret i32 %bfe +} + +; CHECK-LABEL: @ubfe_offset_33_width_4_i64( +; CHECK-NEXT: %1 = lshr i64 %src, 33 +; CHECK-NEXT: %bfe = and i64 %1, 15 +define i64 @ubfe_offset_33_width_4_i64(i64 %src) { + %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 33, i32 4) + ret i64 %bfe +} + +; CHECK-LABEL: @ubfe_offset_0_i64( +; CHECK-NEXT: %1 = sub i32 64, %width +; CHECK-NEXT: %2 = zext i32 %1 to i64 +; CHECK-NEXT: %3 = shl i64 %src, %2 +; CHECK-NEXT: %bfe = lshr i64 %3, %2 +; CHECK-NEXT: ret i64 %bfe +define i64 @ubfe_offset_0_i64(i64 %src, i32 %width) { + %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 0, i32 %width) + ret i64 %bfe +} + +; CHECK-LABEL: @ubfe_offset_32_width_32_i64( +; CHECK-NEXT: %bfe = lshr i64 %src, 32 +; CHECK-NEXT: ret i64 %bfe +define i64 @ubfe_offset_32_width_32_i64(i64 %src) { + %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 32, i32 32) + ret i64 %bfe +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.sbfe +; -------------------------------------------------------------------- + +declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) nounwind readnone +declare i64 @llvm.amdgcn.sbfe.i64(i64, i32, i32) nounwind readnone + +; CHECK-LABEL: @sbfe_offset_31( +; CHECK-NEXT: %1 = sub i32 32, %width +; CHECK-NEXT: %2 = shl i32 %src, %1 +; CHECK-NEXT: %bfe = ashr i32 %2, %1 +; CHECK-NEXT: ret i32 %bfe +define i32 @sbfe_offset_31(i32 %src, i32 %width) { + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 32, i32 %width) + ret i32 %bfe +} + +; CHECK-LABEL: @sbfe_neg1_5_7( +; CHECK-NEXT: ret i32 -1 +define i32 @sbfe_neg1_5_7() { + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 -1, i32 5, i32 7) + ret i32 %bfe +} + +; CHECK-LABEL: @sbfe_offset_32_width_32_i64( +; CHECK-NEXT: 
%bfe = ashr i64 %src, 32 +; CHECK-NEXT: ret i64 %bfe +define i64 @sbfe_offset_32_width_32_i64(i64 %src) { + %bfe = call i64 @llvm.amdgcn.sbfe.i64(i64 %src, i32 32, i32 32) + ret i64 %bfe +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.exp +; -------------------------------------------------------------------- + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) nounwind inaccessiblememonly + +; Make sure there is no crash on invalid variable params +; CHECK-LABEL: @exp_invalid_inputs( +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 %en, float 1.000000e+00, float 2.000000e+00, float 5.000000e-01, float 4.000000e+00, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 %tgt, i32 15, float 1.000000e+00, float 2.000000e+00, float 5.000000e-01, float 4.000000e+00, i1 true, i1 false) +define void @exp_invalid_inputs(i32 %tgt, i32 %en) { + call void @llvm.amdgcn.exp.f32(i32 0, i32 %en, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + call void @llvm.amdgcn.exp.f32(i32 %tgt, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; CHECK-LABEL: @exp_disabled_inputs_to_undef( +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.000000e+00, float undef, float undef, float undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float 2.000000e+00, float undef, float undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float 5.000000e-01, float undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float 4.000000e+00, i1 true, i1 false) + +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %x, float undef, float undef, float undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float %y, float undef, float undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float %z, float undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float %w, i1 true, i1 false) + +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 false) + +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.000000e+00, float 2.000000e+00, float undef, float undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.000000e+00, float undef, float 5.000000e-01, float undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.000000e+00, float undef, float undef, float 4.000000e+00, i1 false, i1 false) +; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.000000e+00, float 2.000000e+00, float 5.000000e-01, float 4.000000e+00, i1 false, i1 false) +define void @exp_disabled_inputs_to_undef(float %x, float %y, float %z, float %w) { + ; enable src0..src3 constants + call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + + ; enable src0..src3 variables + call void
@llvm.amdgcn.exp.f32(i32 0, i32 1, float %x, float %y, float %z, float %w, i1 true, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float %x, float %y, float %z, float %w, i1 true, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float %x, float %y, float %z, float %w, i1 true, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float %x, float %y, float %z, float %w, i1 true, i1 false) + + ; enable none + call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %x, float %y, float %z, float %w, i1 true, i1 false) + + ; enable different source combinations + call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + + ret void +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.exp.compr +; -------------------------------------------------------------------- + +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) nounwind inaccessiblememonly + +; CHECK-LABEL: @exp_compr_invalid_inputs( +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 %en, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> <half 0xH3800, half 0xH4400>, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 %tgt, i32 5, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> <half 0xH3800, half 0xH4400>, i1 true, i1 false) +define void @exp_compr_invalid_inputs(i32 %tgt, i32 %en) { + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 %en, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) + call void @llvm.amdgcn.exp.compr.v2f16(i32 %tgt, i32 5, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) + ret void +} + +; CHECK-LABEL: @exp_compr_disabled_inputs_to_undef( +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> undef, i1 true, i1 false) + +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> %xy, <2 x half> undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> %xy, <2 x half> undef, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> %xy, <2 x half> undef, i1 true, i1 false) + +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> undef, <2 x half> %zw, i1 true, i1 false) +; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false) +define void @exp_compr_disabled_inputs_to_undef(<2 x half> %xy, <2 x half> %zw) { + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, 
half 4.0>, i1 true, i1 false) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) + + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false) + + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false) + ret void +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.fmed3 +; -------------------------------------------------------------------- + +declare float @llvm.amdgcn.fmed3.f32(float, float, float) nounwind readnone + +; CHECK-LABEL: @fmed3_f32( +; CHECK: %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z) +define float @fmed3_f32(float %x, float %y, float %z) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_canonicalize_x_c0_c1_f32( +; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float 0.000000e+00, float 1.000000e+00) +define float @fmed3_canonicalize_x_c0_c1_f32(float %x) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0.0, float 1.0) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_canonicalize_c0_x_c1_f32( +; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float 0.000000e+00, float 1.000000e+00) +define float @fmed3_canonicalize_c0_x_c1_f32(float %x) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %x, float 1.0) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_canonicalize_c0_c1_x_f32( +; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float 0.000000e+00, float 1.000000e+00) +define float @fmed3_canonicalize_c0_c1_x_f32(float %x) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %x) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_canonicalize_x_y_c_f32( +; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.000000e+00) +define float @fmed3_canonicalize_x_y_c_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.0) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_canonicalize_x_c_y_f32( +; CHECK: %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.000000e+00) +define float @fmed3_canonicalize_x_c_y_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 1.0, float %y) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_canonicalize_c_x_y_f32( +; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.000000e+00) +define float @fmed3_canonicalize_c_x_y_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %x, float %y) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_undef_x_y_f32( +; CHECK: call float @llvm.minnum.f32(float %x, float %y) +define float @fmed3_undef_x_y_f32(float %x, 
float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_fmf_undef_x_y_f32( +; CHECK: call nnan float @llvm.minnum.f32(float %x, float %y) +define float @fmed3_fmf_undef_x_y_f32(float %x, float %y) { + %med3 = call nnan float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_x_undef_y_f32( +; CHECK: call float @llvm.minnum.f32(float %x, float %y) +define float @fmed3_x_undef_y_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float undef, float %y) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_x_y_undef_f32( +; CHECK: call float @llvm.minnum.f32(float %x, float %y) +define float @fmed3_x_y_undef_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float undef) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_qnan0_x_y_f32( +; CHECK: call float @llvm.minnum.f32(float %x, float %y) +define float @fmed3_qnan0_x_y_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000000000000, float %x, float %y) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_x_qnan0_y_f32( +; CHECK: call float @llvm.minnum.f32(float %x, float %y) +define float @fmed3_x_qnan0_y_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8000000000000, float %y) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_x_y_qnan0_f32( +; CHECK: call float @llvm.minnum.f32(float %x, float %y) +define float @fmed3_x_y_qnan0_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 0x7FF8000000000000) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_qnan1_x_y_f32( +; CHECK: call float @llvm.minnum.f32(float %x, float %y) +define float @fmed3_qnan1_x_y_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float %x, float %y) + ret float %med3 +} + +; This can return any of the qnans. 
+; CHECK-LABEL: @fmed3_qnan0_qnan1_qnan2_f32( +; CHECK: ret float 0x7FF8002000000000 +define float @fmed3_qnan0_qnan1_qnan2_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float 0x7FF8002000000000, float 0x7FF8030000000000) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_constant_src0_0_f32( +; CHECK: ret float 5.000000e-01 +define float @fmed3_constant_src0_0_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float -1.0, float 4.0) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_constant_src0_1_f32( +; CHECK: ret float 5.000000e-01 +define float @fmed3_constant_src0_1_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float 4.0, float -1.0) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_constant_src1_0_f32( +; CHECK: ret float 5.000000e-01 +define float @fmed3_constant_src1_0_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 0.5, float 4.0) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_constant_src1_1_f32( +; CHECK: ret float 5.000000e-01 +define float @fmed3_constant_src1_1_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float 0.5, float -1.0) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_constant_src2_0_f32( +; CHECK: ret float 5.000000e-01 +define float @fmed3_constant_src2_0_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 4.0, float 0.5) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_constant_src2_1_f32( +; CHECK: ret float 5.000000e-01 +define float @fmed3_constant_src2_1_f32(float %x, float %y) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float -1.0, float 0.5) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_x_qnan0_qnan1_f32( +; CHECK: ret float %x +define float @fmed3_x_qnan0_qnan1_f32(float %x) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8001000000000, float 0x7FF8002000000000) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_qnan0_x_qnan1_f32( +; CHECK: ret float %x +define float @fmed3_qnan0_x_qnan1_f32(float %x) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float %x, float 0x7FF8002000000000) + ret float %med3 +} + +; CHECK-LABEL: @fmed3_qnan0_qnan1_x_f32( +; CHECK: ret float %x +define float @fmed3_qnan0_qnan1_x_f32(float %x) { + %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float 0x7FF8002000000000, float %x) + ret float %med3 +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.icmp +; -------------------------------------------------------------------- + +declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) nounwind readnone convergent +declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) nounwind readnone convergent + +; Make sure there's no crash for invalid input +; CHECK-LABEL: @invalid_nonconstant_icmp_code( +; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 %c) +define i64 @invalid_nonconstant_icmp_code(i32 %a, i32 %b, i32 %c) { + %result = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 %c) + ret i64 %result +} + +; CHECK-LABEL: @invalid_icmp_code( +; CHECK: %under = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 31) +; CHECK: %over = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 42) +define i64 @invalid_icmp_code(i32 %a, i32 %b) { + %under = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 31) + %over = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 42) + %or = or i64 %under, %over + ret i64 %or +} + +; CHECK-LABEL: 
@icmp_constant_inputs_false( +; CHECK: ret i64 0 +define i64 @icmp_constant_inputs_false() { + %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 8, i32 32) + ret i64 %result +} + +; CHECK-LABEL: @icmp_constant_inputs_true( +; CHECK: ret i64 -1 +define i64 @icmp_constant_inputs_true() { + %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 8, i32 34) + ret i64 %result +} + +; CHECK-LABEL: @icmp_constant_to_rhs_slt( +; CHECK: %result = call i64 @llvm.amdgcn.icmp.i32(i32 %x, i32 9, i32 38) +define i64 @icmp_constant_to_rhs_slt(i32 %x) { + %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 %x, i32 40) + ret i64 %result +} + +; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i32( +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32) +define i64 @fold_icmp_ne_0_zext_icmp_eq_i32(i32 %a, i32 %b) { + %cmp = icmp eq i32 %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ne_i32( +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 33) +define i64 @fold_icmp_ne_0_zext_icmp_ne_i32(i32 %a, i32 %b) { + %cmp = icmp ne i32 %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_sle_i32( +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 41) +define i64 @fold_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) { + %cmp = icmp sle i32 %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ugt_i64( +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 34) +define i64 @fold_icmp_ne_0_zext_icmp_ugt_i64(i64 %a, i64 %b) { + %cmp = icmp ugt i64 %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_swap_i64( +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 34) +define i64 @fold_icmp_ne_0_zext_icmp_ult_swap_i64(i64 %a, i64 %b) { + %cmp = icmp ugt i64 %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 0, i32 %zext.cmp, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f32( +; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 1) +define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f32(float %a, float %b) { + %cmp = fcmp oeq float %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_une_f32( +; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 14) +define i64 @fold_icmp_ne_0_zext_fcmp_une_f32(float %a, float %b) { + %cmp = fcmp une float %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_olt_f64( +; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f64(double %a, double %b, i32 4) +define i64 @fold_icmp_ne_0_zext_fcmp_olt_f64(double %a, double %b) { + %cmp = fcmp olt double %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_sext_icmp_ne_0_i32( +; CHECK: %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32) +define i64 @fold_icmp_sext_icmp_ne_0_i32(i32 
%a, i32 %b) { + %cmp = icmp eq i32 %a, %b + %sext.cmp = sext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 0, i32 33) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_eq_i32( +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 33) +define i64 @fold_icmp_eq_0_zext_icmp_eq_i32(i32 %a, i32 %b) { + %cmp = icmp eq i32 %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_slt_i32( +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 39) +define i64 @fold_icmp_eq_0_zext_icmp_slt_i32(i32 %a, i32 %b) { + %cmp = icmp slt i32 %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_oeq_f32( +; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 14) +define i64 @fold_icmp_eq_0_zext_fcmp_oeq_f32(float %a, float %b) { + %cmp = fcmp oeq float %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ule_f32( +; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 2) +define i64 @fold_icmp_eq_0_zext_fcmp_ule_f32(float %a, float %b) { + %cmp = fcmp ule float %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ogt_f32( +; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 13) +define i64 @fold_icmp_eq_0_zext_fcmp_ogt_f32(float %a, float %b) { + %cmp = fcmp ogt float %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_zext_icmp_eq_1_i32( +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32) +define i64 @fold_icmp_zext_icmp_eq_1_i32(i32 %a, i32 %b) { + %cmp = icmp eq i32 %a, %b + %zext.cmp = zext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 1, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_zext_argi1_eq_1_i32( +; CHECK: %zext.cond = zext i1 %cond to i32 +; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 0, i32 33) +define i64 @fold_icmp_zext_argi1_eq_1_i32(i1 %cond) { + %zext.cond = zext i1 %cond to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 1, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_zext_argi1_eq_neg1_i32( +; CHECK: %zext.cond = zext i1 %cond to i32 +; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 -1, i32 32) +define i64 @fold_icmp_zext_argi1_eq_neg1_i32(i1 %cond) { + %zext.cond = zext i1 %cond to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 -1, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_sext_argi1_eq_1_i32( +; CHECK: %sext.cond = sext i1 %cond to i32 +; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 1, i32 32) +define i64 @fold_icmp_sext_argi1_eq_1_i32(i1 %cond) { + %sext.cond = sext i1 %cond to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 1, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i32( +; CHECK: %sext.cond = sext i1 %cond to i32 +; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 0, i32 33) +define i64 @fold_icmp_sext_argi1_eq_neg1_i32(i1 %cond) { + %sext.cond = sext i1 %cond to i32 + %mask = call 
i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 -1, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i64( +; CHECK: %sext.cond = sext i1 %cond to i64 +; CHECK: call i64 @llvm.amdgcn.icmp.i64(i64 %sext.cond, i64 0, i32 33) +define i64 @fold_icmp_sext_argi1_eq_neg1_i64(i1 %cond) { + %sext.cond = sext i1 %cond to i64 + %mask = call i64 @llvm.amdgcn.icmp.i64(i64 %sext.cond, i64 -1, i32 32) + ret i64 %mask +} + +; TODO: Should be able to fold to false +; CHECK-LABEL: @fold_icmp_sext_icmp_eq_1_i32( +; CHECK: %cmp = icmp eq i32 %a, %b +; CHECK: %sext.cmp = sext i1 %cmp to i32 +; CHECK: %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 1, i32 32) +define i64 @fold_icmp_sext_icmp_eq_1_i32(i32 %a, i32 %b) { + %cmp = icmp eq i32 %a, %b + %sext.cmp = sext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 1, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_sext_icmp_eq_neg1_i32( +; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32) +define i64 @fold_icmp_sext_icmp_eq_neg1_i32(i32 %a, i32 %b) { + %cmp = icmp eq i32 %a, %b + %sext.cmp = sext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 -1, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_icmp_sext_icmp_sge_neg1_i32( +; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 39) +define i64 @fold_icmp_sext_icmp_sge_neg1_i32(i32 %a, i32 %b) { + %cmp = icmp sge i32 %a, %b + %sext.cmp = sext i1 %cmp to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 -1, i32 32) + ret i64 %mask +} + +; CHECK-LABEL: @fold_not_icmp_ne_0_zext_icmp_sle_i32( +; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 38) +define i64 @fold_not_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) { + %cmp = icmp sle i32 %a, %b + %not = xor i1 %cmp, true + %zext.cmp = zext i1 %not to i32 + %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33) + ret i64 %mask +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.fcmp +; -------------------------------------------------------------------- + +declare i64 @llvm.amdgcn.fcmp.f32(float, float, i32) nounwind readnone convergent + +; Make sure there's no crash for invalid input +; CHECK-LABEL: @invalid_nonconstant_fcmp_code( +; CHECK: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 %c) +define i64 @invalid_nonconstant_fcmp_code(float %a, float %b, i32 %c) { + %result = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 %c) + ret i64 %result +} + +; CHECK-LABEL: @invalid_fcmp_code( +; CHECK: %under = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 -1) +; CHECK: %over = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 16) +define i64 @invalid_fcmp_code(float %a, float %b) { + %under = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 -1) + %over = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 16) + %or = or i64 %under, %over + ret i64 %or +} + +; CHECK-LABEL: @fcmp_constant_inputs_false( +; CHECK: ret i64 0 +define i64 @fcmp_constant_inputs_false() { + %result = call i64 @llvm.amdgcn.fcmp.f32(float 2.0, float 4.0, i32 1) + ret i64 %result +} + +; CHECK-LABEL: @fcmp_constant_inputs_true( +; CHECK: ret i64 -1 +define i64 @fcmp_constant_inputs_true() { + %result = call i64 @llvm.amdgcn.fcmp.f32(float 2.0, float 4.0, i32 4) + ret i64 %result +} + +; CHECK-LABEL: @fcmp_constant_to_rhs_olt( +; CHECK: %result = call i64 @llvm.amdgcn.fcmp.f32(float %x, float 4.000000e+00, i32 2) +define i64 @fcmp_constant_to_rhs_olt(float %x) { + %result = 
call i64 @llvm.amdgcn.fcmp.f32(float 4.0, float %x, i32 4) + ret i64 %result +} diff --git a/test/Transforms/InstCombine/and-or-icmps.ll b/test/Transforms/InstCombine/and-or-icmps.ll index 3903472e9119..e3aeee293139 100644 --- a/test/Transforms/InstCombine/and-or-icmps.ll +++ b/test/Transforms/InstCombine/and-or-icmps.ll @@ -39,15 +39,167 @@ define i1 @PR2330(i32 %a, i32 %b) { ret i1 %and } -define i1 @test(i32 %tmp1030) { -; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP1030_OFF:%.*]] = add i32 %tmp1030, -39 -; CHECK-NEXT: [[TMP1030_CMP:%.*]] = icmp ugt i32 [[TMP1030_OFF]], 1 -; CHECK-NEXT: ret i1 [[TMP1030_CMP]] -; - %tmp1037 = icmp ne i32 %tmp1030, 39 - %tmp1039 = icmp ne i32 %tmp1030, 40 - %tmp1042 = and i1 %tmp1037, %tmp1039 - ret i1 %tmp1042 +; if LHSC and RHSC differ only by one bit: +; (X == C1 || X == C2) -> (X | (C1 ^ C2)) == C2 +; PR14708: https://bugs.llvm.org/show_bug.cgi?id=14708 + +define i1 @or_eq_with_one_bit_diff_constants1(i32 %x) { +; CHECK-LABEL: @or_eq_with_one_bit_diff_constants1( +; CHECK-NEXT: [[TMP1:%.*]] = or i32 %x, 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 51 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp eq i32 %x, 50 + %cmp2 = icmp eq i32 %x, 51 + %or = or i1 %cmp1, %cmp2 + ret i1 %or +} + +; (X != C1 && X != C2) -> (X | (C1 ^ C2)) != C2 + +define i1 @and_ne_with_one_bit_diff_constants1(i32 %x) { +; CHECK-LABEL: @and_ne_with_one_bit_diff_constants1( +; CHECK-NEXT: [[TMP1:%.*]] = or i32 %x, 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 51 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp ne i32 %x, 51 + %cmp2 = icmp ne i32 %x, 50 + %and = and i1 %cmp1, %cmp2 + ret i1 %and +} + +; The constants are not necessarily off-by-one, just off-by-one-bit. + +define i1 @or_eq_with_one_bit_diff_constants2(i32 %x) { +; CHECK-LABEL: @or_eq_with_one_bit_diff_constants2( +; CHECK-NEXT: [[TMP1:%.*]] = or i32 %x, 32 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 97 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp eq i32 %x, 97 + %cmp2 = icmp eq i32 %x, 65 + %or = or i1 %cmp1, %cmp2 + ret i1 %or +} + +define i1 @and_ne_with_one_bit_diff_constants2(i19 %x) { +; CHECK-LABEL: @and_ne_with_one_bit_diff_constants2( +; CHECK-NEXT: [[TMP1:%.*]] = or i19 %x, 128 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i19 [[TMP1]], 193 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp ne i19 %x, 65 + %cmp2 = icmp ne i19 %x, 193 + %and = and i1 %cmp1, %cmp2 + ret i1 %and +} + +; Make sure the constants are treated as unsigned when comparing them. + +define i1 @or_eq_with_one_bit_diff_constants3(i8 %x) { +; CHECK-LABEL: @or_eq_with_one_bit_diff_constants3( +; CHECK-NEXT: [[TMP1:%.*]] = or i8 %x, -128 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], -2 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp eq i8 %x, 254 + %cmp2 = icmp eq i8 %x, 126 + %or = or i1 %cmp1, %cmp2 + ret i1 %or +} + +define i1 @and_ne_with_one_bit_diff_constants3(i8 %x) { +; CHECK-LABEL: @and_ne_with_one_bit_diff_constants3( +; CHECK-NEXT: [[TMP1:%.*]] = or i8 %x, -128 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i8 [[TMP1]], -63 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp ne i8 %x, 65 + %cmp2 = icmp ne i8 %x, 193 + %and = and i1 %cmp1, %cmp2 + ret i1 %and +} + +; Use an 'add' to eliminate an icmp if the constants are off-by-one (not off-by-one-bit). 
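Both shapes are easy to confirm exhaustively at i8 width; a plain-C brute-force check (illustration only, reusing the 65/97 and 13/14 constants from these tests):

#include <assert.h>
#include <stdint.h>

int main(void) {
  for (unsigned x = 0; x < 256; ++x) {
    /* 65 and 97 differ only in bit 5: or that bit in, compare once */
    assert(((x == 65) || (x == 97)) == ((uint8_t)(x | 32) == 97));
    /* 13 and 14 differ by one in value: subtract and compare unsigned */
    assert(((x == 13) || (x == 14)) == ((uint8_t)(x - 13) < 2));
  }
  return 0;
}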
+; (X == 13 | X == 14) -> X-13 <u 2 + +define i1 @or_eq_with_diff_one(i8 %x) { +; CHECK-LABEL: @or_eq_with_diff_one( +; CHECK-NEXT: [[TMP1:%.*]] = add i8 %x, -13 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i8 [[TMP1]], 2 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp eq i8 %x, 13 + %cmp2 = icmp eq i8 %x, 14 + %or = or i1 %cmp1, %cmp2 + ret i1 %or +} + +; (X != 40 & X != 39) -> X-39 >u 1 + +define i1 @and_ne_with_diff_one(i32 %x) { +; CHECK-LABEL: @and_ne_with_diff_one( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 %x, -39 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[TMP1]], 1 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp ne i32 %x, 40 + %cmp2 = icmp ne i32 %x, 39 + %and = and i1 %cmp1, %cmp2 + ret i1 %and +} + +; Make sure the constants are treated as signed when comparing them. +; PR32524: https://bugs.llvm.org/show_bug.cgi?id=32524 + +define i1 @or_eq_with_diff_one_signed(i32 %x) { +; CHECK-LABEL: @or_eq_with_diff_one_signed( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 %x, 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[TMP1]], 2 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp eq i32 %x, 0 + %cmp2 = icmp eq i32 %x, -1 + %or = or i1 %cmp1, %cmp2 + ret i1 %or +} + +define i1 @and_ne_with_diff_one_signed(i64 %x) { +; CHECK-LABEL: @and_ne_with_diff_one_signed( +; CHECK-NEXT: [[TMP1:%.*]] = add i64 %x, 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], 1 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %cmp1 = icmp ne i64 %x, -1 + %cmp2 = icmp ne i64 %x, 0 + %and = and i1 %cmp1, %cmp2 + ret i1 %and +} + +; Vectors with splat constants get the same folds. + +define <2 x i1> @or_eq_with_one_bit_diff_constants2_splatvec(<2 x i32> %x) { +; CHECK-LABEL: @or_eq_with_one_bit_diff_constants2_splatvec( +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> %x, <i32 32, i32 32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], <i32 97, i32 97> +; CHECK-NEXT: ret <2 x i1> [[TMP2]] +; + %cmp1 = icmp eq <2 x i32> %x, <i32 97, i32 97> + %cmp2 = icmp eq <2 x i32> %x, <i32 65, i32 65> + %or = or <2 x i1> %cmp1, %cmp2 + ret <2 x i1> %or +} + +define <2 x i1> @and_ne_with_diff_one_splatvec(<2 x i32> %x) { +; CHECK-LABEL: @and_ne_with_diff_one_splatvec( +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> %x, <i32 -39, i32 -39> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i32> [[TMP1]], <i32 1, i32 1> +; CHECK-NEXT: ret <2 x i1> [[TMP2]] +; + %cmp1 = icmp ne <2 x i32> %x, <i32 40, i32 40> + %cmp2 = icmp ne <2 x i32> %x, <i32 39, i32 39> + %and = and <2 x i1> %cmp1, %cmp2 + ret <2 x i1> %and } diff --git a/test/Transforms/InstCombine/and.ll b/test/Transforms/InstCombine/and.ll index e45012878ed5..9a4d1e5758b3 100644 --- a/test/Transforms/InstCombine/and.ll +++ b/test/Transforms/InstCombine/and.ll @@ -176,7 +176,7 @@ define i8 @test16(i8 %A) { define i8 @test17(i8 %X, i8 %Y) { ; CHECK-LABEL: @test17( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i8 %Y, -1 -; CHECK-NEXT: [[D:%.*]] = or i8 %X, [[Y_NOT]] +; CHECK-NEXT: [[D:%.*]] = or i8 [[Y_NOT]], %X ; CHECK-NEXT: ret i8 [[D]] ; %B = xor i8 %X, -1 @@ -311,19 +311,6 @@ define <2 x i1> @test25vec(<2 x i32> %A) { ret <2 x i1> %D } -define i1 @test26(i32 %A) { -; CHECK-LABEL: @test26( -; CHECK-NEXT: [[A_OFF:%.*]] = add i32 %A, -49 -; CHECK-NEXT: [[A_CMP:%.*]] = icmp ugt i32 [[A_OFF]], 1 -; CHECK-NEXT: ret i1 [[A_CMP]] -; - %B = icmp ne i32 %A, 49 - %C = icmp ne i32 %A, 50 - ;; (A-49) > 1 - %D = and i1 %B, %C - ret i1 %D -} - define i8 @test27(i8 %A) { ; CHECK-LABEL: @test27( ; CHECK-NEXT: ret i8 0 @@ -382,6 +369,18 @@ define i32 @test31(i1 %X) { ret i32 %A } +; Demanded bits analysis allows us to eliminate the add.
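Carries propagate only upward, so adding 8 can never change bits 0 through 2, and the mask 7 demands nothing else. A minimal exhaustive check of the scalar identity (plain C, not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* adding 8 never disturbs the low three bits */
  for (uint32_t x = 0; x <= 0xFFFF; ++x)
    assert(((x + 8) & 7) == (x & 7));
  return 0;
}

The splat-vector test that follows relies on exactly this.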
+ +define <2 x i32> @and_demanded_bits_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @and_demanded_bits_splat_vec( +; CHECK-NEXT: [[Z:%.*]] = and <2 x i32> %x, <i32 7, i32 7> +; CHECK-NEXT: ret <2 x i32> [[Z]] +; + %y = add <2 x i32> %x, <i32 8, i32 8> + %z = and <2 x i32> %y, <i32 7, i32 7> + ret <2 x i32> %z +} + define i32 @test32(i32 %In) { ; CHECK-LABEL: @test32( ; CHECK-NEXT: ret i32 0 @@ -405,6 +404,42 @@ define i32 @test33(i32 %b) { ret i32 %tmp.13 } +define i32 @test33b(i32 %b) { +; CHECK-LABEL: @test33b( +; CHECK-NEXT: [[TMP_13:%.*]] = xor i32 [[B:%.*]], 1 +; CHECK-NEXT: ret i32 [[TMP_13]] +; + %tmp.4.mask = and i32 %b, 1 + %tmp.10 = xor i32 %tmp.4.mask, 1 + %tmp.12 = and i32 %b, -2 + %tmp.13 = or i32 %tmp.10, %tmp.12 + ret i32 %tmp.13 +} + +define <2 x i32> @test33vec(<2 x i32> %b) { +; CHECK-LABEL: @test33vec( +; CHECK-NEXT: [[TMP_13:%.*]] = xor <2 x i32> [[B:%.*]], <i32 1, i32 1> +; CHECK-NEXT: ret <2 x i32> [[TMP_13]] +; + %tmp.4.mask = and <2 x i32> %b, <i32 1, i32 1> + %tmp.10 = xor <2 x i32> %tmp.4.mask, <i32 1, i32 1> + %tmp.12 = and <2 x i32> %b, <i32 -2, i32 -2> + %tmp.13 = or <2 x i32> %tmp.12, %tmp.10 + ret <2 x i32> %tmp.13 +} + +define <2 x i32> @test33vecb(<2 x i32> %b) { +; CHECK-LABEL: @test33vecb( +; CHECK-NEXT: [[TMP_13:%.*]] = xor <2 x i32> [[B:%.*]], <i32 1, i32 1> +; CHECK-NEXT: ret <2 x i32> [[TMP_13]] +; + %tmp.4.mask = and <2 x i32> %b, <i32 1, i32 1> + %tmp.10 = xor <2 x i32> %tmp.4.mask, <i32 1, i32 1> + %tmp.12 = and <2 x i32> %b, <i32 -2, i32 -2> + %tmp.13 = or <2 x i32> %tmp.10, %tmp.12 + ret <2 x i32> %tmp.13 +} + define i32 @test34(i32 %A, i32 %B) { ; CHECK-LABEL: @test34( ; CHECK-NEXT: ret i32 %B @@ -425,3 +460,156 @@ define <2 x i32> @PR24942(<2 x i32> %x) { ret <2 x i32> %and } +define i64 @test35(i32 %X) { +; CHECK-LABEL: @test35( +; CHECK-NEXT: %[[sub:.*]] = sub i32 0, %X +; CHECK-NEXT: %[[and:.*]] = and i32 %[[sub]], 240 +; CHECK-NEXT: %[[cst:.*]] = zext i32 %[[and]] to i64 +; CHECK-NEXT: ret i64 %[[cst]] + %zext = zext i32 %X to i64 + %zsub = sub i64 0, %zext + %res = and i64 %zsub, 240 + ret i64 %res +} + +define i64 @test36(i32 %X) { +; CHECK-LABEL: @test36( +; CHECK-NEXT: %[[sub:.*]] = add i32 %X, 7 +; CHECK-NEXT: %[[and:.*]] = and i32 %[[sub]], 240 +; CHECK-NEXT: %[[cst:.*]] = zext i32 %[[and]] to i64 +; CHECK-NEXT: ret i64 %[[cst]] + %zext = zext i32 %X to i64 + %zsub = add i64 %zext, 7 + %res = and i64 %zsub, 240 + ret i64 %res +} + +define i64 @test37(i32 %X) { +; CHECK-LABEL: @test37( +; CHECK-NEXT: %[[sub:.*]] = mul i32 %X, 7 +; CHECK-NEXT: %[[and:.*]] = and i32 %[[sub]], 240 +; CHECK-NEXT: %[[cst:.*]] = zext i32 %[[and]] to i64 +; CHECK-NEXT: ret i64 %[[cst]] + %zext = zext i32 %X to i64 + %zsub = mul i64 %zext, 7 + %res = and i64 %zsub, 240 + ret i64 %res +} + +define i64 @test38(i32 %X) { +; CHECK-LABEL: @test38( +; CHECK-NEXT: %[[and:.*]] = and i32 %X, 240 +; CHECK-NEXT: %[[cst:.*]] = zext i32 %[[and]] to i64 +; CHECK-NEXT: ret i64 %[[cst]] + %zext = zext i32 %X to i64 + %zsub = xor i64 %zext, 7 + %res = and i64 %zsub, 240 + ret i64 %res +} + +define i64 @test39(i32 %X) { +; CHECK-LABEL: @test39( +; CHECK-NEXT: %[[and:.*]] = and i32 %X, 240 +; CHECK-NEXT: %[[cst:.*]] = zext i32 %[[and]] to i64 +; CHECK-NEXT: ret i64 %[[cst]] + %zext = zext i32 %X to i64 + %zsub = or i64 %zext, 7 + %res = and i64 %zsub, 240 + ret i64 %res +} + +define i32 @test40(i1 %C) { +; CHECK-LABEL: @test40( +; CHECK-NEXT: [[A:%.*]] = select i1 [[C:%.*]], i32 104, i32 10 +; CHECK-NEXT: ret i32 [[A]] +; + %A = select i1 %C, i32 1000, i32 10 + %V = and i32 %A, 123 
+ ret i32 %V +} + +define <2 x i32> @test40vec(i1 %C) { +; CHECK-LABEL: @test40vec( +; CHECK-NEXT: [[A:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 104, i32 104>, <2 x i32> <i32 10, i32 10> +; CHECK-NEXT: ret <2 x i32> [[A]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10> + %V = and <2 x i32> %A, <i32 123, i32 123> + ret <2 x i32> %V +} + +define <2 x i32> @test40vec2(i1 %C) { +; CHECK-LABEL: @test40vec2( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 104, i32 324>, <2 x i32> <i32 10, i32 12> +; CHECK-NEXT: ret <2 x i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30> + %V = and <2 x i32> %A, <i32 123, i32 333> + ret <2 x i32> %V +} + +define i32 @test41(i1 %which) { +; CHECK-LABEL: @test41( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi i32 [ 104, [[ENTRY:%.*]] ], [ 10, [[DELAY]] ] +; CHECK-NEXT: ret i32 [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi i32 [ 1000, %entry ], [ 10, %delay ] + %value = and i32 %A, 123 + ret i32 %value +} + +define <2 x i32> @test41vec(i1 %which) { +; CHECK-LABEL: @test41vec( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 104, i32 104>, [[ENTRY:%.*]] ], [ <i32 10, i32 10>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ] + %value = and <2 x i32> %A, <i32 123, i32 123> + ret <2 x i32> %value +} + +define <2 x i32> @test41vec2(i1 %which) { +; CHECK-LABEL: @test41vec2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 104, i32 324>, [[ENTRY:%.*]] ], [ <i32 10, i32 12>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ] + %value = and <2 x i32> %A, <i32 123, i32 333> + ret <2 x i32> %value +} diff --git a/test/Transforms/InstCombine/and2.ll b/test/Transforms/InstCombine/and2.ll index 3d043b0864cd..001ac58891e4 100644 --- a/test/Transforms/InstCombine/and2.ll +++ b/test/Transforms/InstCombine/and2.ll @@ -45,21 +45,6 @@ define <4 x i32> @test5(<4 x i32> %A) { ret <4 x i32> %2 } -; Check that we combine "if x!=0 && x!=-1" into "if x+1u>1" -define i32 @test6(i64 %x) nounwind { -; CHECK-LABEL: @test6( -; CHECK-NEXT: [[X_OFF:%.*]] = add i64 %x, 1 -; CHECK-NEXT: [[X_CMP:%.*]] = icmp ugt i64 [[X_OFF]], 1 -; CHECK-NEXT: [[LAND_EXT:%.*]] = zext i1 [[X_CMP]] to i32 -; CHECK-NEXT: ret i32 [[LAND_EXT]] -; - %cmp1 = icmp ne i64 %x, -1 - %not.cmp = icmp ne i64 %x, 0 - %.cmp1 = and i1 %cmp1, %not.cmp - %land.ext = zext i1 %.cmp1 to i32 - ret i32 %land.ext -} - define i1 @test7(i32 %i, i1 %b) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 %i, 0 @@ -110,6 +95,18 @@ define i64 @test9(i64 %x) { ret i64 %and } +; combine -x & 1 into x & 1 +define <2 x i64> @test9vec(<2 x i64> %x) { +; CHECK-LABEL: 
@test9vec( +; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i64> zeroinitializer, [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> [[SUB]], <i64 1, i64 1> +; CHECK-NEXT: ret <2 x i64> [[AND]] +; + %sub = sub nsw <2 x i64> <i64 0, i64 0>, %x + %and = and <2 x i64> %sub, <i64 1, i64 1> + ret <2 x i64> %and +} + define i64 @test10(i64 %x) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[AND:%.*]] = and i64 %x, 1 @@ -122,3 +119,63 @@ define i64 @test10(i64 %x) { ret i64 %add } +; The add in this test is unnecessary because the LSBs of the LHS are 0 and the 'and' only consumes bits from those LSBs. It doesn't matter what happens to the upper bits. +define i32 @test11(i32 %a, i32 %b) { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[X:%.*]] = shl i32 [[A:%.*]], 8 +; CHECK-NEXT: [[Z:%.*]] = and i32 [[B:%.*]], 128 +; CHECK-NEXT: [[W:%.*]] = mul i32 [[Z]], [[X]] +; CHECK-NEXT: ret i32 [[W]] +; + %x = shl i32 %a, 8 + %y = add i32 %x, %b + %z = and i32 %y, 128 + %w = mul i32 %z, %x ; to keep the shift from being removed + ret i32 %w +} + +; The add in this test is unnecessary because the LSBs of the RHS are 0 and the 'and' only consumes bits from those LSBs. It doesn't matter what happens to the upper bits. +define i32 @test12(i32 %a, i32 %b) { +; CHECK-LABEL: @test12( +; CHECK-NEXT: [[X:%.*]] = shl i32 [[A:%.*]], 8 +; CHECK-NEXT: [[Z:%.*]] = and i32 [[B:%.*]], 128 +; CHECK-NEXT: [[W:%.*]] = mul i32 [[Z]], [[X]] +; CHECK-NEXT: ret i32 [[W]] +; + %x = shl i32 %a, 8 + %y = add i32 %b, %x + %z = and i32 %y, 128 + %w = mul i32 %z, %x ; to keep the shift from being removed + ret i32 %w +} + +; The sub in this test is unnecessary because the LSBs of the RHS are 0 and the 'and' only consumes bits from those LSBs. It doesn't matter what happens to the upper bits. +define i32 @test13(i32 %a, i32 %b) { +; CHECK-LABEL: @test13( +; CHECK-NEXT: [[X:%.*]] = shl i32 [[A:%.*]], 8 +; CHECK-NEXT: [[Z:%.*]] = and i32 [[B:%.*]], 128 +; CHECK-NEXT: [[W:%.*]] = mul i32 [[Z]], [[X]] +; CHECK-NEXT: ret i32 [[W]] +; + %x = shl i32 %a, 8 + %y = sub i32 %b, %x + %z = and i32 %y, 128 + %w = mul i32 %z, %x ; to keep the shift from being removed + ret i32 %w +} + +; The sub in this test cannot be removed because we need to keep the negation of %b, but its LHS is simplified to 0 since %x contributes nothing to the demanded bit.
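A sketch of the arithmetic behind tests 11 through 14 (plain C, illustration only): x = a << 8 has zero low bits, so under the mask 128 it is invisible to an add or a sub, and x - b reduces to 0 - b.

#include <assert.h>
#include <stdint.h>

int main(void) {
  for (uint32_t a = 0; a < 256; ++a) {
    for (uint32_t b = 0; b < 256; ++b) {
      uint32_t x = a << 8;                          /* low 8 bits are 0 */
      assert(((x + b) & 128) == (b & 128));         /* tests 11 and 12  */
      assert(((b - x) & 128) == (b & 128));         /* test 13          */
      assert(((x - b) & 128) == ((0 - b) & 128));   /* test 14          */
    }
  }
  return 0;
}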
+define i32 @test14(i32 %a, i32 %b) { +; CHECK-LABEL: @test14( +; CHECK-NEXT: [[X:%.*]] = shl i32 [[A:%.*]], 8 +; CHECK-NEXT: [[Y:%.*]] = sub i32 0, [[B:%.*]] +; CHECK-NEXT: [[Z:%.*]] = and i32 [[Y]], 128 +; CHECK-NEXT: [[W:%.*]] = mul i32 [[Z]], [[X]] +; CHECK-NEXT: ret i32 [[W]] +; + %x = shl i32 %a, 8 + %y = sub i32 %x, %b + %z = and i32 %y, 128 + %w = mul i32 %z, %x ; to keep the shift from being removed + ret i32 %w +} diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll index e1e6b7c48c47..f339de35d77c 100644 --- a/test/Transforms/InstCombine/apint-shift.ll +++ b/test/Transforms/InstCombine/apint-shift.ll @@ -63,6 +63,8 @@ define i55 @test6(i55 %A) { ret i55 %C } +; (X * C2) << C1 --> X * (C2 << C1) + define i55 @test6a(i55 %A) { ; CHECK-LABEL: @test6a( ; CHECK-NEXT: [[C:%.*]] = mul i55 %A, 6 @@ -73,6 +75,18 @@ define i55 @test6a(i55 %A) { ret i55 %C } +; (X * C2) << C1 --> X * (C2 << C1) + +define <2 x i55> @test6a_vec(<2 x i55> %A) { +; CHECK-LABEL: @test6a_vec( +; CHECK-NEXT: [[C:%.*]] = mul <2 x i55> %A, <i55 6, i55 48> +; CHECK-NEXT: ret <2 x i55> [[C]] +; + %B = mul <2 x i55> %A, <i55 3, i55 12> + %C = shl <2 x i55> %B, <i55 1, i55 2> + ret <2 x i55> %C +} + define i29 @test7(i8 %X) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: ret i29 -1 @@ -101,14 +115,150 @@ define i17 @test9(i17 %A) { ret i17 %C } -define i19 @test10(i19 %A) { +; shl (lshr X, C), C --> and X, C' + +define i19 @test10(i19 %X) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: [[B:%.*]] = and i19 %A, -262144 -; CHECK-NEXT: ret i19 [[B]] +; CHECK-NEXT: [[SH1:%.*]] = and i19 %X, -262144 +; CHECK-NEXT: ret i19 [[SH1]] +; + %sh1 = lshr i19 %X, 18 + %sh2 = shl i19 %sh1, 18 + ret i19 %sh2 +} + +; Two right shifts in the same direction: +; lshr (lshr X, C1), C2 --> lshr X, C1 + C2 + +define <2 x i19> @lshr_lshr_splat_vec(<2 x i19> %X) { +; CHECK-LABEL: @lshr_lshr_splat_vec( +; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i19> %X, <i19 5, i19 5> +; CHECK-NEXT: ret <2 x i19> [[SH1]] +; + %sh1 = lshr <2 x i19> %X, <i19 3, i19 3> + %sh2 = lshr <2 x i19> %sh1, <i19 2, i19 2> + ret <2 x i19> %sh2 +} + +define i9 @multiuse_lshr_lshr(i9 %x) { +; CHECK-LABEL: @multiuse_lshr_lshr( +; CHECK-NEXT: [[SH1:%.*]] = lshr i9 %x, 2 +; CHECK-NEXT: [[SH2:%.*]] = lshr i9 %x, 5 +; CHECK-NEXT: [[MUL:%.*]] = mul i9 [[SH1]], [[SH2]] +; CHECK-NEXT: ret i9 [[MUL]] +; + %sh1 = lshr i9 %x, 2 + %sh2 = lshr i9 %sh1, 3 + %mul = mul i9 %sh1, %sh2 + ret i9 %mul +} + +define <2 x i9> @multiuse_lshr_lshr_splat(<2 x i9> %x) { +; CHECK-LABEL: @multiuse_lshr_lshr_splat( +; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i9> %x, <i9 2, i9 2> +; CHECK-NEXT: [[SH2:%.*]] = lshr <2 x i9> %x, <i9 5, i9 5> +; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i9> [[SH1]], [[SH2]] +; CHECK-NEXT: ret <2 x i9> [[MUL]] +; + %sh1 = lshr <2 x i9> %x, <i9 2, i9 2> + %sh2 = lshr <2 x i9> %sh1, <i9 3, i9 3> + %mul = mul <2 x i9> %sh1, %sh2 + ret <2 x i9> %mul +} + +; Two left shifts in the same direction: +; shl (shl X, C1), C2 --> shl X, C1 + C2 + +define <2 x i19> @shl_shl_splat_vec(<2 x i19> %X) { +; CHECK-LABEL: @shl_shl_splat_vec( +; CHECK-NEXT: [[SH1:%.*]] = shl <2 x i19> %X, <i19 5, i19 5> +; CHECK-NEXT: ret <2 x i19> [[SH1]] +; + %sh1 = shl <2 x i19> %X, <i19 3, i19 3> + %sh2 = shl <2 x i19> %sh1, <i19 2, i19 2> + ret <2 x i19> %sh2 +} + +define i42 @multiuse_shl_shl(i42 %x) { +; CHECK-LABEL: @multiuse_shl_shl( +; CHECK-NEXT: [[SH1:%.*]] = shl i42 %x, 8 +; CHECK-NEXT: [[SH2:%.*]] = shl i42 %x, 17 +; CHECK-NEXT: [[MUL:%.*]] = mul i42 [[SH1]], [[SH2]] +; CHECK-NEXT: ret 
i42 [[MUL]] +; + %sh1 = shl i42 %x, 8 + %sh2 = shl i42 %sh1, 9 + %mul = mul i42 %sh1, %sh2 + ret i42 %mul +} + +define <2 x i42> @multiuse_shl_shl_splat(<2 x i42> %x) { +; CHECK-LABEL: @multiuse_shl_shl_splat( +; CHECK-NEXT: [[SH1:%.*]] = shl <2 x i42> %x, <i42 8, i42 8> +; CHECK-NEXT: [[SH2:%.*]] = shl <2 x i42> %x, <i42 17, i42 17> +; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i42> [[SH1]], [[SH2]] +; CHECK-NEXT: ret <2 x i42> [[MUL]] +; + %sh1 = shl <2 x i42> %x, <i42 8, i42 8> + %sh2 = shl <2 x i42> %sh1, <i42 9, i42 9> + %mul = mul <2 x i42> %sh1, %sh2 + ret <2 x i42> %mul +} + +; Equal shift amounts in opposite directions become bitwise 'and': +; lshr (shl X, C), C --> and X, C' + +define <2 x i19> @eq_shl_lshr_splat_vec(<2 x i19> %X) { +; CHECK-LABEL: @eq_shl_lshr_splat_vec( +; CHECK-NEXT: [[SH1:%.*]] = and <2 x i19> %X, <i19 65535, i19 65535> +; CHECK-NEXT: ret <2 x i19> [[SH1]] +; + %sh1 = shl <2 x i19> %X, <i19 3, i19 3> + %sh2 = lshr <2 x i19> %sh1, <i19 3, i19 3> + ret <2 x i19> %sh2 +} + +; Equal shift amounts in opposite directions become bitwise 'and': +; shl (lshr X, C), C --> and X, C' + +define <2 x i19> @eq_lshr_shl_splat_vec(<2 x i19> %X) { +; CHECK-LABEL: @eq_lshr_shl_splat_vec( +; CHECK-NEXT: [[SH1:%.*]] = and <2 x i19> %X, <i19 -8, i19 -8> +; CHECK-NEXT: ret <2 x i19> [[SH1]] +; + %sh1 = lshr <2 x i19> %X, <i19 3, i19 3> + %sh2 = shl <2 x i19> %sh1, <i19 3, i19 3> + ret <2 x i19> %sh2 +} + +; In general, we would need an 'and' for this transform, but the masked-off bits are known zero. +; shl (lshr X, C1), C2 --> lshr X, C1 - C2 + +define <2 x i7> @lshr_shl_splat_vec(<2 x i7> %X) { +; CHECK-LABEL: @lshr_shl_splat_vec( +; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i7> %X, <i7 -8, i7 -8> +; CHECK-NEXT: [[SH1:%.*]] = lshr exact <2 x i7> [[MUL]], <i7 1, i7 1> +; CHECK-NEXT: ret <2 x i7> [[SH1]] +; + %mul = mul <2 x i7> %X, <i7 -8, i7 -8> + %sh1 = lshr exact <2 x i7> %mul, <i7 3, i7 3> + %sh2 = shl nuw nsw <2 x i7> %sh1, <i7 2, i7 2> + ret <2 x i7> %sh2 +} + +; In general, we would need an 'and' for this transform, but the masked-off bits are known zero. +; lshr (shl X, C1), C2 --> shl X, C1 - C2 + +define <2 x i7> @shl_lshr_splat_vec(<2 x i7> %X) { +; CHECK-LABEL: @shl_lshr_splat_vec( +; CHECK-NEXT: [[DIV:%.*]] = udiv <2 x i7> %X, <i7 9, i7 9> +; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw <2 x i7> [[DIV]], <i7 1, i7 1> +; CHECK-NEXT: ret <2 x i7> [[SH1]] ; - %B = lshr i19 %A, 18 - %C = shl i19 %B, 18 - ret i19 %C + %div = udiv <2 x i7> %X, <i7 9, i7 9> + %sh1 = shl nuw <2 x i7> %div, <i7 3, i7 3> + %sh2 = lshr exact <2 x i7> %sh1, <i7 2, i7 2> + ret <2 x i7> %sh2 } ; Don't hide the shl from scalar evolution. DAGCombine will get it. @@ -125,14 +275,29 @@ define i23 @test11(i23 %A) { ret i23 %C } -define i47 @test12(i47 %A) { +; shl (ashr X, C), C --> and X, C' + +define i47 @test12(i47 %X) { ; CHECK-LABEL: @test12( -; CHECK-NEXT: [[B1:%.*]] = and i47 %A, -256 -; CHECK-NEXT: ret i47 [[B1]] +; CHECK-NEXT: [[SH11:%.*]] = and i47 %X, -256 +; CHECK-NEXT: ret i47 [[SH11]] +; + %sh1 = ashr i47 %X, 8 + %sh2 = shl i47 %sh1, 8 + ret i47 %sh2 +} + +; FIXME: Same as above with vectors. 
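The identities behind this block of shift tests are plain bit arithmetic; a 32-bit sanity check (plain C, illustration only; the last line assumes the usual arithmetic behavior of >> on int32_t):

#include <assert.h>
#include <stdint.h>

int main(void) {
  for (int32_t x = -50000; x < 50000; ++x) {
    uint32_t u = (uint32_t)x;
    assert(((u >> 3) >> 2) == (u >> 5));                /* lshr of lshr adds  */
    assert(((u << 3) << 2) == (u << 5));                /* shl of shl adds    */
    assert(((u >> 3) << 3) == (u & ~7u));               /* equal amounts mask */
    assert(((uint32_t)(x >> 8) << 8) == (u & ~0xFFu));  /* shl of ashr masks  */
  }
  return 0;
}

The splat-vector form of that last shl-of-ashr case is the one the FIXME above tracks.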
+ +define <2 x i47> @test12_splat_vec(<2 x i47> %X) { +; CHECK-LABEL: @test12_splat_vec( +; CHECK-NEXT: [[SH1:%.*]] = ashr <2 x i47> %X, <i47 8, i47 8> +; CHECK-NEXT: [[SH2:%.*]] = shl nsw <2 x i47> [[SH1]], <i47 8, i47 8> +; CHECK-NEXT: ret <2 x i47> [[SH2]] ; - %B = ashr i47 %A, 8 - %C = shl i47 %B, 8 - ret i47 %C + %sh1 = ashr <2 x i47> %X, <i47 8, i47 8> + %sh2 = shl <2 x i47> %sh1, <i47 8, i47 8> + ret <2 x i47> %sh2 } ; Don't hide the shl from scalar evolution. DAGCombine will get it. @@ -330,6 +495,66 @@ define i11 @test23(i44 %A) { ret i11 %D } +; Fold lshr (shl X, C), C -> and X, C' regardless of the number of uses of the shl. + +define i44 @shl_lshr_eq_amt_multi_use(i44 %A) { +; CHECK-LABEL: @shl_lshr_eq_amt_multi_use( +; CHECK-NEXT: [[B:%.*]] = shl i44 %A, 33 +; CHECK-NEXT: [[C:%.*]] = and i44 %A, 2047 +; CHECK-NEXT: [[D:%.*]] = or i44 [[B]], [[C]] +; CHECK-NEXT: ret i44 [[D]] +; + %B = shl i44 %A, 33 + %C = lshr i44 %B, 33 + %D = add i44 %B, %C + ret i44 %D +} + +; Fold vector lshr (shl X, C), C -> and X, C' regardless of the number of uses of the shl. + +define <2 x i44> @shl_lshr_eq_amt_multi_use_splat_vec(<2 x i44> %A) { +; CHECK-LABEL: @shl_lshr_eq_amt_multi_use_splat_vec( +; CHECK-NEXT: [[B:%.*]] = shl <2 x i44> %A, <i44 33, i44 33> +; CHECK-NEXT: [[C:%.*]] = and <2 x i44> %A, <i44 2047, i44 2047> +; CHECK-NEXT: [[D:%.*]] = or <2 x i44> [[B]], [[C]] +; CHECK-NEXT: ret <2 x i44> [[D]] +; + %B = shl <2 x i44> %A, <i44 33, i44 33> + %C = lshr <2 x i44> %B, <i44 33, i44 33> + %D = add <2 x i44> %B, %C + ret <2 x i44> %D +} + +; Fold shl (lshr X, C), C -> and X, C' regardless of the number of uses of the lshr. + +define i43 @lshr_shl_eq_amt_multi_use(i43 %A) { +; CHECK-LABEL: @lshr_shl_eq_amt_multi_use( +; CHECK-NEXT: [[B:%.*]] = lshr i43 %A, 23 +; CHECK-NEXT: [[C:%.*]] = and i43 %A, -8388608 +; CHECK-NEXT: [[D:%.*]] = mul i43 [[B]], [[C]] +; CHECK-NEXT: ret i43 [[D]] +; + %B = lshr i43 %A, 23 + %C = shl i43 %B, 23 + %D = mul i43 %B, %C + ret i43 %D +} + +; Fold vector shl (lshr X, C), C -> and X, C' regardless of the number of uses of the lshr. 
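Extra uses of the first shift do not change the arithmetic: the round-tripped value is still just a mask of the original. A 32-bit analog of the i43/i44 tests (plain C, illustration only):

#include <assert.h>
#include <stdint.h>

int main(void) {
  for (uint64_t i = 0; i <= 0xFFFF; ++i) {
    uint32_t x = (uint32_t)(i * 2654435761u);        /* arbitrary samples    */
    assert(((x << 23) >> 23) == (x & 0x1FFu));       /* lshr (shl X, 23), 23 */
    assert(((x >> 23) << 23) == (x & 0xFF800000u));  /* shl (lshr X, 23), 23 */
  }
  return 0;
}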
+ +define <2 x i43> @lshr_shl_eq_amt_multi_use_splat_vec(<2 x i43> %A) { +; CHECK-LABEL: @lshr_shl_eq_amt_multi_use_splat_vec( +; CHECK-NEXT: [[B:%.*]] = lshr <2 x i43> %A, <i43 23, i43 23> +; CHECK-NEXT: [[C:%.*]] = and <2 x i43> %A, <i43 -8388608, i43 -8388608> +; CHECK-NEXT: [[D:%.*]] = mul <2 x i43> [[B]], [[C]] +; CHECK-NEXT: ret <2 x i43> [[D]] +; + %B = lshr <2 x i43> %A, <i43 23, i43 23> + %C = shl <2 x i43> %B, <i43 23, i43 23> + %D = mul <2 x i43> %B, %C + ret <2 x i43> %D +} + define i37 @test25(i37 %tmp.2, i37 %AA) { ; CHECK-LABEL: @test25( ; CHECK-NEXT: [[TMP_3:%.*]] = and i37 %tmp.2, -131072 diff --git a/test/Transforms/InstCombine/apint-sub.ll b/test/Transforms/InstCombine/apint-sub.ll index eb314ce3d1b2..1a4e62ff0d73 100644 --- a/test/Transforms/InstCombine/apint-sub.ll +++ b/test/Transforms/InstCombine/apint-sub.ll @@ -50,7 +50,7 @@ define i19 @test5(i19 %A, i19 %Bok, i19 %Cok) { define i57 @test6(i57 %A, i57 %B) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i57 %B, -1 -; CHECK-NEXT: [[D:%.*]] = and i57 %A, [[B_NOT]] +; CHECK-NEXT: [[D:%.*]] = and i57 [[B_NOT]], %A ; CHECK-NEXT: ret i57 [[D]] ; %C = and i57 %A, %B diff --git a/test/Transforms/InstCombine/assume.ll b/test/Transforms/InstCombine/assume.ll index 6e690426db99..13fa6339e85a 100644 --- a/test/Transforms/InstCombine/assume.ll +++ b/test/Transforms/InstCombine/assume.ll @@ -176,13 +176,13 @@ define i32 @icmp2(i32 %a) #0 { ret i32 %lnot.ext } -; FIXME: If the 'not' of a condition is known true, then the condition must be false. +; If the 'not' of a condition is known true, then the condition must be false. define i1 @assume_not(i1 %cond) { ; CHECK-LABEL: @assume_not( ; CHECK-NEXT: [[NOTCOND:%.*]] = xor i1 [[COND:%.*]], true ; CHECK-NEXT: call void @llvm.assume(i1 [[NOTCOND]]) -; CHECK-NEXT: ret i1 [[COND]] +; CHECK-NEXT: ret i1 false ; %notcond = xor i1 %cond, true call void @llvm.assume(i1 %notcond) diff --git a/test/Transforms/InstCombine/bitcast-bigendian.ll b/test/Transforms/InstCombine/bitcast-bigendian.ll index 1a91d11d8aee..e940f0fcec75 100644 --- a/test/Transforms/InstCombine/bitcast-bigendian.ll +++ b/test/Transforms/InstCombine/bitcast-bigendian.ll @@ -9,8 +9,8 @@ target triple = "powerpc64-unknown-linux-gnu" define float @test2(<2 x float> %A, <2 x i32> %B) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> %A, i32 1 -; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i32> %B to <2 x float> +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 1 +; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i32> [[B:%.*]] to <2 x float> ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[BC]], i32 1 ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]] ; CHECK-NEXT: ret float [[ADD]] @@ -29,8 +29,8 @@ define float @test2(<2 x float> %A, <2 x i32> %B) { define float @test3(<2 x float> %A, <2 x i64> %B) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> %A, i32 0 -; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> %B to <4 x float> +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[BC2]], i32 1 ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]] ; CHECK-NEXT: ret float [[ADD]] @@ -51,8 +51,8 @@ define float @test3(<2 x float> %A, <2 x i64> %B) { define <2 x i32> @test4(i32 %A, i32 %B){ ; CHECK-LABEL: @test4( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 %B, i32 
0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %A, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A:%.*]], i32 1 ; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %tmp38 = zext i32 %A to i64 @@ -65,8 +65,8 @@ define <2 x i32> @test4(i32 %A, i32 %B){ define <2 x float> @test5(float %A, float %B) { ; CHECK-LABEL: @test5( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float %B, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float %A, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[A:%.*]], i32 1 ; CHECK-NEXT: ret <2 x float> [[TMP2]] ; %tmp37 = bitcast float %A to i32 @@ -81,9 +81,8 @@ define <2 x float> @test5(float %A, float %B) { define <2 x float> @test6(float %A){ ; CHECK-LABEL: @test6( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float %A, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float 4.200000e+01, i32 1 -; CHECK-NEXT: ret <2 x float> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float undef, float 4.200000e+01>, float [[A:%.*]], i32 0 +; CHECK-NEXT: ret <2 x float> [[TMP1]] ; %tmp23 = bitcast float %A to i32 %tmp24 = zext i32 %tmp23 to i64 @@ -97,8 +96,8 @@ define <2 x float> @test6(float %A){ define <2 x i32> @xor_bitcast_vec_to_vec(<1 x i64> %a) { ; CHECK-LABEL: @xor_bitcast_vec_to_vec( -; CHECK-NEXT: [[T21:%.*]] = xor <1 x i64> %a, <i64 4294967298> -; CHECK-NEXT: [[T2:%.*]] = bitcast <1 x i64> [[T21]] to <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = xor <1 x i64> [[A:%.*]], <i64 4294967298> +; CHECK-NEXT: [[T2:%.*]] = bitcast <1 x i64> [[TMP1]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[T2]] ; %t1 = bitcast <1 x i64> %a to <2 x i32> @@ -110,8 +109,8 @@ define <2 x i32> @xor_bitcast_vec_to_vec(<1 x i64> %a) { define i64 @and_bitcast_vec_to_int(<2 x i32> %a) { ; CHECK-LABEL: @and_bitcast_vec_to_int( -; CHECK-NEXT: [[T21:%.*]] = and <2 x i32> %a, <i32 0, i32 3> -; CHECK-NEXT: [[T2:%.*]] = bitcast <2 x i32> [[T21]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], <i32 0, i32 3> +; CHECK-NEXT: [[T2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 ; CHECK-NEXT: ret i64 [[T2]] ; %t1 = bitcast <2 x i32> %a to i64 @@ -123,8 +122,8 @@ define i64 @and_bitcast_vec_to_int(<2 x i32> %a) { define <2 x i32> @or_bitcast_int_to_vec(i64 %a) { ; CHECK-LABEL: @or_bitcast_int_to_vec( -; CHECK-NEXT: [[T21:%.*]] = or i64 %a, 4294967298 -; CHECK-NEXT: [[T2:%.*]] = bitcast i64 [[T21]] to <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], 4294967298 +; CHECK-NEXT: [[T2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[T2]] ; %t1 = bitcast i64 %a to <2 x i32> diff --git a/test/Transforms/InstCombine/bitcast.ll b/test/Transforms/InstCombine/bitcast.ll index 08f49660f184..2e7f30fee14d 100644 --- a/test/Transforms/InstCombine/bitcast.ll +++ b/test/Transforms/InstCombine/bitcast.ll @@ -21,7 +21,7 @@ define i32 @test1(i64 %a) { define <2 x i32> @xor_two_vector_bitcasts(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: @xor_two_vector_bitcasts( -; CHECK-NEXT: [[T31:%.*]] = xor <1 x i64> %a, %b +; CHECK-NEXT: [[T31:%.*]] = xor <1 x i64> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[T3:%.*]] = bitcast <1 x i64> [[T31]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[T3]] ; @@ -35,8 +35,8 @@ define <2 x i32> @xor_two_vector_bitcasts(<1 x i64> %a, <1 x i64> %b) { 
define <2 x i32> @xor_bitcast_vec_to_vec(<1 x i64> %a) { ; CHECK-LABEL: @xor_bitcast_vec_to_vec( -; CHECK-NEXT: [[T21:%.*]] = xor <1 x i64> %a, <i64 8589934593> -; CHECK-NEXT: [[T2:%.*]] = bitcast <1 x i64> [[T21]] to <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = xor <1 x i64> [[A:%.*]], <i64 8589934593> +; CHECK-NEXT: [[T2:%.*]] = bitcast <1 x i64> [[TMP1]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[T2]] ; %t1 = bitcast <1 x i64> %a to <2 x i32> @@ -48,8 +48,8 @@ define <2 x i32> @xor_bitcast_vec_to_vec(<1 x i64> %a) { define i64 @and_bitcast_vec_to_int(<2 x i32> %a) { ; CHECK-LABEL: @and_bitcast_vec_to_int( -; CHECK-NEXT: [[T21:%.*]] = and <2 x i32> %a, <i32 3, i32 0> -; CHECK-NEXT: [[T2:%.*]] = bitcast <2 x i32> [[T21]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], <i32 3, i32 0> +; CHECK-NEXT: [[T2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 ; CHECK-NEXT: ret i64 [[T2]] ; %t1 = bitcast <2 x i32> %a to i64 @@ -61,8 +61,8 @@ define i64 @and_bitcast_vec_to_int(<2 x i32> %a) { define <2 x i32> @or_bitcast_int_to_vec(i64 %a) { ; CHECK-LABEL: @or_bitcast_int_to_vec( -; CHECK-NEXT: [[T21:%.*]] = or i64 %a, 8589934593 -; CHECK-NEXT: [[T2:%.*]] = bitcast i64 [[T21]] to <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[A:%.*]], 8589934593 +; CHECK-NEXT: [[T2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[T2]] ; %t1 = bitcast i64 %a to <2 x i32> @@ -74,8 +74,8 @@ define <2 x i32> @or_bitcast_int_to_vec(i64 %a) { define <4 x i32> @bitcasts_and_bitcast(<4 x i32> %a, <8 x i16> %b) { ; CHECK-LABEL: @bitcasts_and_bitcast( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> %b to <4 x i32> -; CHECK-NEXT: [[BC3:%.*]] = and <4 x i32> [[TMP1]], %a +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <4 x i32> +; CHECK-NEXT: [[BC3:%.*]] = and <4 x i32> [[TMP1]], [[A:%.*]] ; CHECK-NEXT: ret <4 x i32> [[BC3]] ; %bc1 = bitcast <4 x i32> %a to <2 x i64> @@ -91,8 +91,8 @@ define <4 x i32> @bitcasts_and_bitcast(<4 x i32> %a, <8 x i16> %b) { define <4 x float> @bitcasts_and_bitcast_to_fp(<4 x float> %a, <8 x i16> %b) { ; CHECK-LABEL: @bitcasts_and_bitcast_to_fp( -; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x float> %a to <2 x i64> -; CHECK-NEXT: [[BC2:%.*]] = bitcast <8 x i16> %b to <2 x i64> +; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x float> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[B:%.*]] to <2 x i64> ; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> [[BC2]], [[BC1]] ; CHECK-NEXT: [[BC3:%.*]] = bitcast <2 x i64> [[AND]] to <4 x float> ; CHECK-NEXT: ret <4 x float> [[BC3]] @@ -108,8 +108,8 @@ define <4 x float> @bitcasts_and_bitcast_to_fp(<4 x float> %a, <8 x i16> %b) { define i128 @bitcast_or_bitcast(i128 %a, <2 x i64> %b) { ; CHECK-LABEL: @bitcast_or_bitcast( -; CHECK-NEXT: [[BC1:%.*]] = bitcast i128 %a to <2 x i64> -; CHECK-NEXT: [[OR:%.*]] = or <2 x i64> [[BC1]], %b +; CHECK-NEXT: [[BC1:%.*]] = bitcast i128 [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[OR:%.*]] = or <2 x i64> [[BC1]], [[B:%.*]] ; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[OR]] to i128 ; CHECK-NEXT: ret i128 [[BC2]] ; @@ -123,8 +123,8 @@ define i128 @bitcast_or_bitcast(i128 %a, <2 x i64> %b) { define <4 x i32> @bitcast_xor_bitcast(<4 x i32> %a, i128 %b) { ; CHECK-LABEL: @bitcast_xor_bitcast( -; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> %a to i128 -; CHECK-NEXT: [[XOR:%.*]] = xor i128 [[BC1]], %b +; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[A:%.*]] to i128 +; CHECK-NEXT: [[XOR:%.*]] = xor i128 [[BC1]], [[B:%.*]] ; CHECK-NEXT: [[BC2:%.*]] = bitcast i128 [[XOR]] to <4 x i32> ; CHECK-NEXT: 
ret <4 x i32> [[BC2]] ; @@ -138,8 +138,8 @@ define <4 x i32> @bitcast_xor_bitcast(<4 x i32> %a, i128 %b) { define <4 x float> @bitcast_vector_select(<4 x float> %x, <2 x i64> %y, <4 x i1> %cmp) { ; CHECK-LABEL: @bitcast_vector_select( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %y to <4 x float> -; CHECK-NEXT: [[T7:%.*]] = select <4 x i1> %cmp, <4 x float> %x, <4 x float> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <4 x float> +; CHECK-NEXT: [[T7:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x float> [[X:%.*]], <4 x float> [[TMP1]] ; CHECK-NEXT: ret <4 x float> [[T7]] ; %t4 = bitcast <4 x float> %x to <4 x i32> @@ -151,8 +151,8 @@ define <4 x float> @bitcast_vector_select(<4 x float> %x, <2 x i64> %y, <4 x i1> define float @bitcast_scalar_select_of_scalars(float %x, i32 %y, i1 %cmp) { ; CHECK-LABEL: @bitcast_scalar_select_of_scalars( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 %y to float -; CHECK-NEXT: [[T7:%.*]] = select i1 %cmp, float %x, float [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[Y:%.*]] to float +; CHECK-NEXT: [[T7:%.*]] = select i1 [[CMP:%.*]], float [[X:%.*]], float [[TMP1]] ; CHECK-NEXT: ret float [[T7]] ; %t4 = bitcast float %x to i32 @@ -166,8 +166,8 @@ define float @bitcast_scalar_select_of_scalars(float %x, i32 %y, i1 %cmp) { define float @bitcast_scalar_select_type_mismatch1(float %x, <4 x i8> %y, i1 %cmp) { ; CHECK-LABEL: @bitcast_scalar_select_type_mismatch1( -; CHECK-NEXT: [[T4:%.*]] = bitcast float %x to <4 x i8> -; CHECK-NEXT: [[T6:%.*]] = select i1 %cmp, <4 x i8> [[T4]], <4 x i8> %y +; CHECK-NEXT: [[T4:%.*]] = bitcast float [[X:%.*]] to <4 x i8> +; CHECK-NEXT: [[T6:%.*]] = select i1 [[CMP:%.*]], <4 x i8> [[T4]], <4 x i8> [[Y:%.*]] ; CHECK-NEXT: [[T7:%.*]] = bitcast <4 x i8> [[T6]] to float ; CHECK-NEXT: ret float [[T7]] ; @@ -182,8 +182,8 @@ define float @bitcast_scalar_select_type_mismatch1(float %x, <4 x i8> %y, i1 %cm define <4 x i8> @bitcast_scalar_select_type_mismatch2(<4 x i8> %x, float %y, i1 %cmp) { ; CHECK-LABEL: @bitcast_scalar_select_type_mismatch2( -; CHECK-NEXT: [[T4:%.*]] = bitcast <4 x i8> %x to float -; CHECK-NEXT: [[T6:%.*]] = select i1 %cmp, float [[T4]], float %y +; CHECK-NEXT: [[T4:%.*]] = bitcast <4 x i8> [[X:%.*]] to float +; CHECK-NEXT: [[T6:%.*]] = select i1 [[CMP:%.*]], float [[T4]], float [[Y:%.*]] ; CHECK-NEXT: [[T7:%.*]] = bitcast float [[T6]] to <4 x i8> ; CHECK-NEXT: ret <4 x i8> [[T7]] ; @@ -195,8 +195,8 @@ define <4 x i8> @bitcast_scalar_select_type_mismatch2(<4 x i8> %x, float %y, i1 define <4 x float> @bitcast_scalar_select_of_vectors(<4 x float> %x, <2 x i64> %y, i1 %cmp) { ; CHECK-LABEL: @bitcast_scalar_select_of_vectors( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %y to <4 x float> -; CHECK-NEXT: [[T7:%.*]] = select i1 %cmp, <4 x float> %x, <4 x float> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <4 x float> +; CHECK-NEXT: [[T7:%.*]] = select i1 [[CMP:%.*]], <4 x float> [[X:%.*]], <4 x float> [[TMP1]] ; CHECK-NEXT: ret <4 x float> [[T7]] ; %t4 = bitcast <4 x float> %x to <4 x i32> @@ -210,9 +210,9 @@ define <4 x float> @bitcast_scalar_select_of_vectors(<4 x float> %x, <2 x i64> % define float @bitcast_vector_select_no_fold1(float %x, <2 x i16> %y, <4 x i1> %cmp) { ; CHECK-LABEL: @bitcast_vector_select_no_fold1( -; CHECK-NEXT: [[T4:%.*]] = bitcast float %x to <4 x i8> -; CHECK-NEXT: [[T5:%.*]] = bitcast <2 x i16> %y to <4 x i8> -; CHECK-NEXT: [[T6:%.*]] = select <4 x i1> %cmp, <4 x i8> [[T4]], <4 x i8> [[T5]] +; CHECK-NEXT: [[T4:%.*]] = bitcast float [[X:%.*]] to <4 x 
i8> +; CHECK-NEXT: [[T5:%.*]] = bitcast <2 x i16> [[Y:%.*]] to <4 x i8> +; CHECK-NEXT: [[T6:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i8> [[T4]], <4 x i8> [[T5]] ; CHECK-NEXT: [[T7:%.*]] = bitcast <4 x i8> [[T6]] to float ; CHECK-NEXT: ret float [[T7]] ; @@ -227,9 +227,9 @@ define float @bitcast_vector_select_no_fold1(float %x, <2 x i16> %y, <4 x i1> %c define <2 x float> @bitcast_vector_select_no_fold2(<2 x float> %x, <4 x i16> %y, <8 x i1> %cmp) { ; CHECK-LABEL: @bitcast_vector_select_no_fold2( -; CHECK-NEXT: [[T4:%.*]] = bitcast <2 x float> %x to <8 x i8> -; CHECK-NEXT: [[T5:%.*]] = bitcast <4 x i16> %y to <8 x i8> -; CHECK-NEXT: [[T6:%.*]] = select <8 x i1> %cmp, <8 x i8> [[T4]], <8 x i8> [[T5]] +; CHECK-NEXT: [[T4:%.*]] = bitcast <2 x float> [[X:%.*]] to <8 x i8> +; CHECK-NEXT: [[T5:%.*]] = bitcast <4 x i16> [[Y:%.*]] to <8 x i8> +; CHECK-NEXT: [[T6:%.*]] = select <8 x i1> [[CMP:%.*]], <8 x i8> [[T4]], <8 x i8> [[T5]] ; CHECK-NEXT: [[T7:%.*]] = bitcast <8 x i8> [[T6]] to <2 x float> ; CHECK-NEXT: ret <2 x float> [[T7]] ; @@ -244,8 +244,8 @@ define <2 x float> @bitcast_vector_select_no_fold2(<2 x float> %x, <4 x i16> %y, ; rdar://7892780 define float @test2(<2 x float> %A, <2 x i32> %B) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> %A, i32 0 -; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i32> %B to <2 x float> +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i32> [[B:%.*]] to <2 x float> ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[BC]], i32 0 ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]] ; CHECK-NEXT: ret float [[ADD]] @@ -266,8 +266,8 @@ define float @test2(<2 x float> %A, <2 x i32> %B) { ; rdar://7892780 define float @test3(<2 x float> %A, <2 x i64> %B) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> %A, i32 1 -; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> %B to <4 x float> +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 1 +; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[BC2]], i32 2 ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]] ; CHECK-NEXT: ret float [[ADD]] @@ -290,7 +290,7 @@ define float @test3(<2 x float> %A, <2 x i64> %B) { define float @bitcast_extelt1(<2 x float> %A) { ; CHECK-LABEL: @bitcast_extelt1( -; CHECK-NEXT: [[BC2:%.*]] = extractelement <2 x float> %A, i32 0 +; CHECK-NEXT: [[BC2:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: ret float [[BC2]] ; %bc1 = bitcast <2 x float> %A to <2 x i32> @@ -303,7 +303,7 @@ define float @bitcast_extelt1(<2 x float> %A) { define i64 @bitcast_extelt2(<4 x float> %A) { ; CHECK-LABEL: @bitcast_extelt2( -; CHECK-NEXT: [[BC:%.*]] = bitcast <4 x float> %A to <2 x i64> +; CHECK-NEXT: [[BC:%.*]] = bitcast <4 x float> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: [[BC2:%.*]] = extractelement <2 x i64> [[BC]], i32 1 ; CHECK-NEXT: ret i64 [[BC2]] ; @@ -317,7 +317,7 @@ define i64 @bitcast_extelt2(<4 x float> %A) { define <2 x i32> @bitcast_extelt3(<2 x i32> %A) { ; CHECK-LABEL: @bitcast_extelt3( -; CHECK-NEXT: [[BC1:%.*]] = bitcast <2 x i32> %A to <1 x i64> +; CHECK-NEXT: [[BC1:%.*]] = bitcast <2 x i32> [[A:%.*]] to <1 x i64> ; CHECK-NEXT: [[EXT:%.*]] = extractelement <1 x i64> [[BC1]], i32 0 ; CHECK-NEXT: [[BC2:%.*]] = bitcast i64 [[EXT]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[BC2]] @@ -332,7 +332,7 @@ define <2 x i32> @bitcast_extelt3(<2 x i32> %A) { define 
double @bitcast_extelt4(i128 %A) { ; CHECK-LABEL: @bitcast_extelt4( -; CHECK-NEXT: [[BC:%.*]] = bitcast i128 %A to <2 x double> +; CHECK-NEXT: [[BC:%.*]] = bitcast i128 [[A:%.*]] to <2 x double> ; CHECK-NEXT: [[BC2:%.*]] = extractelement <2 x double> [[BC]], i32 0 ; CHECK-NEXT: ret double [[BC2]] ; @@ -344,8 +344,8 @@ define double @bitcast_extelt4(i128 %A) { define <2 x i32> @test4(i32 %A, i32 %B){ ; CHECK-LABEL: @test4( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 %A, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %B, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[B:%.*]], i32 1 ; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %tmp38 = zext i32 %A to i64 @@ -359,8 +359,8 @@ define <2 x i32> @test4(i32 %A, i32 %B){ ; rdar://8360454 define <2 x float> @test5(float %A, float %B) { ; CHECK-LABEL: @test5( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float %A, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float %B, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[B:%.*]], i32 1 ; CHECK-NEXT: ret <2 x float> [[TMP2]] ; %tmp37 = bitcast float %A to i32 @@ -375,7 +375,7 @@ define <2 x float> @test5(float %A, float %B) { define <2 x float> @test6(float %A){ ; CHECK-LABEL: @test6( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float 4.200000e+01, float undef>, float %A, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float 4.200000e+01, float undef>, float [[A:%.*]], i32 1 ; CHECK-NEXT: ret <2 x float> [[TMP1]] ; %tmp23 = bitcast float %A to i32 @@ -422,7 +422,7 @@ define i32 @All111(i32 %in) { define <2 x i16> @BitcastInsert(i32 %a) { ; CHECK-LABEL: @BitcastInsert( -; CHECK-NEXT: [[R:%.*]] = bitcast i32 %a to <2 x i16> +; CHECK-NEXT: [[R:%.*]] = bitcast i32 [[A:%.*]] to <2 x i16> ; CHECK-NEXT: ret <2 x i16> [[R]] ; %v = insertelement <1 x i32> undef, i32 %a, i32 0 @@ -433,7 +433,7 @@ define <2 x i16> @BitcastInsert(i32 %a) { ; PR17293 define <2 x i64> @test7(<2 x i8*>* %arg) nounwind { ; CHECK-LABEL: @test7( -; CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i8*>* %arg to <2 x i64>* +; CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i8*>* [[ARG:%.*]] to <2 x i64>* ; CHECK-NEXT: [[LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[CAST]], align 16 ; CHECK-NEXT: ret <2 x i64> [[LOAD]] ; @@ -452,25 +452,24 @@ define i8 @test8() { @g = internal unnamed_addr global i32 undef -; CHECK-LABEL: @constant_fold_vector_to_double( -; CHECK: store volatile double 1.000000e+00, -; CHECK: store volatile double 1.000000e+00, -; CHECK: store volatile double 1.000000e+00, -; CHECK: store volatile double 1.000000e+00, - -; CHECK: store volatile double 0xFFFFFFFFFFFFFFFF, -; CHECK: store volatile double 0x162E000004D2, - -; CHECK: store volatile double bitcast (<2 x i32> <i32 1234, i32 ptrtoint (i32* @g to i32)> to double), -; CHECK: store volatile double 0x400000003F800000, - -; CHECK: store volatile double 0.000000e+00, -; CHECK: store volatile double 0.000000e+00, -; CHECK: store volatile double 0.000000e+00, -; CHECK: store volatile double 0.000000e+00, -; CHECK: store volatile double 0.000000e+00, -; CHECK: store volatile double 0.000000e+00, define void @constant_fold_vector_to_double() { +; CHECK-LABEL: @constant_fold_vector_to_double( +; CHECK-NEXT: store volatile double 1.000000e+00, double* undef, align 8 
+; CHECK-NEXT: store volatile double 1.000000e+00, double* undef, align 8 +; CHECK-NEXT: store volatile double 1.000000e+00, double* undef, align 8 +; CHECK-NEXT: store volatile double 1.000000e+00, double* undef, align 8 +; CHECK-NEXT: store volatile double 0xFFFFFFFFFFFFFFFF, double* undef, align 8 +; CHECK-NEXT: store volatile double 0x162E000004D2, double* undef, align 8 +; CHECK-NEXT: store volatile double bitcast (<2 x i32> <i32 1234, i32 ptrtoint (i32* @g to i32)> to double), double* undef, align 8 +; CHECK-NEXT: store volatile double 0x400000003F800000, double* undef, align 8 +; CHECK-NEXT: store volatile double 0.000000e+00, double* undef, align 8 +; CHECK-NEXT: store volatile double 0.000000e+00, double* undef, align 8 +; CHECK-NEXT: store volatile double 0.000000e+00, double* undef, align 8 +; CHECK-NEXT: store volatile double 0.000000e+00, double* undef, align 8 +; CHECK-NEXT: store volatile double 0.000000e+00, double* undef, align 8 +; CHECK-NEXT: store volatile double 0.000000e+00, double* undef, align 8 +; CHECK-NEXT: ret void +; store volatile double bitcast (<1 x i64> <i64 4607182418800017408> to double), double* undef store volatile double bitcast (<2 x i32> <i32 0, i32 1072693248> to double), double* undef store volatile double bitcast (<4 x i16> <i16 0, i16 0, i16 0, i16 16368> to double), double* undef @@ -491,12 +490,14 @@ define void @constant_fold_vector_to_double() { ret void } -; CHECK-LABEL: @constant_fold_vector_to_float( -; CHECK: store volatile float 1.000000e+00, -; CHECK: store volatile float 1.000000e+00, -; CHECK: store volatile float 1.000000e+00, -; CHECK: store volatile float 1.000000e+00, define void @constant_fold_vector_to_float() { +; CHECK-LABEL: @constant_fold_vector_to_float( +; CHECK-NEXT: store volatile float 1.000000e+00, float* undef, align 4 +; CHECK-NEXT: store volatile float 1.000000e+00, float* undef, align 4 +; CHECK-NEXT: store volatile float 1.000000e+00, float* undef, align 4 +; CHECK-NEXT: store volatile float 1.000000e+00, float* undef, align 4 +; CHECK-NEXT: ret void +; store volatile float bitcast (<1 x i32> <i32 1065353216> to float), float* undef store volatile float bitcast (<2 x i16> <i16 0, i16 16256> to float), float* undef store volatile float bitcast (<4 x i8> <i8 0, i8 0, i8 128, i8 63> to float), float* undef @@ -505,10 +506,12 @@ define void @constant_fold_vector_to_float() { ret void } -; CHECK-LABEL: @constant_fold_vector_to_half( -; CHECK: store volatile half 0xH4000, -; CHECK: store volatile half 0xH4000, define void @constant_fold_vector_to_half() { +; CHECK-LABEL: @constant_fold_vector_to_half( +; CHECK-NEXT: store volatile half 0xH4000, half* undef, align 2 +; CHECK-NEXT: store volatile half 0xH4000, half* undef, align 2 +; CHECK-NEXT: ret void +; store volatile half bitcast (<2 x i8> <i8 0, i8 64> to half), half* undef store volatile half bitcast (<4 x i4> <i4 0, i4 0, i4 0, i4 4> to half), half* undef ret void diff --git a/test/Transforms/InstCombine/bitreverse-fold.ll b/test/Transforms/InstCombine/bitreverse-fold.ll index ecdfbc8cb5f9..b798ad33b3f0 100644 --- a/test/Transforms/InstCombine/bitreverse-fold.ll +++ b/test/Transforms/InstCombine/bitreverse-fold.ll @@ -37,6 +37,13 @@ define i32 @reverse_neg1_i32() { ret i32 %x } +; CHECK-LABEL: @reverse_undef_i32( +; CHECK-NEXT: ret i32 undef +define i32 @reverse_undef_i32() { + %x = call i32 @llvm.bitreverse.i32(i32 undef) + ret i32 %x +} + ; CHECK-LABEL: @reverse_false_i1( ; CHECK-NEXT: ret i1 false define i1 @reverse_false_i1() { @@ -51,6 +58,13 @@ define i1 
@reverse_true_i1() { ret i1 %x } +; CHECK-LABEL: @reverse_undef_i1( +; CHECK-NEXT: ret i1 undef +define i1 @reverse_undef_i1() { + %x = call i1 @llvm.bitreverse.i1(i1 undef) + ret i1 %x +} + ; CHECK-LABEL: @reverse_false_v2i1( ; CHECK-NEXT: ret <2 x i1> zeroinitializer define <2 x i1> @reverse_false_v2i1() { diff --git a/test/Transforms/InstCombine/bitreverse-known-bits.ll b/test/Transforms/InstCombine/bitreverse-known-bits.ll new file mode 100644 index 000000000000..cd1523a3b06b --- /dev/null +++ b/test/Transforms/InstCombine/bitreverse-known-bits.ll @@ -0,0 +1,51 @@ +; RUN: opt < %s -S -instcombine | FileCheck %s + +declare i8 @llvm.bitreverse.i8(i8) +declare i32 @llvm.bitreverse.i32(i32) + +; CHECK-LABEL: @test1 +; CHECK: ret i1 true +define i1 @test1(i32 %arg) { + %a = or i32 %arg, 4294901760 + %b = call i32 @llvm.bitreverse.i32(i32 %a) + %and = and i32 %b, 65535 + %res = icmp eq i32 %and, 65535 + ret i1 %res +} + +; CHECK-LABEL: @test2 +; CHECK: ret i1 true +define i1 @test2(i32 %arg) { + %a = or i32 %arg, 1 + %b = call i32 @llvm.bitreverse.i32(i32 %a) + %c = and i32 %b, 2147483648 + %d = call i32 @llvm.bitreverse.i32(i32 %c) + %res = icmp eq i32 %d, 1 + ret i1 %res +} + +; CHECK-LABEL: @test3 +; CHECK: ret i1 false +define i1 @test3(i32 %arg) { + %a = or i32 %arg, 65536 + %b = call i32 @llvm.bitreverse.i32(i32 %a) + %and = and i32 %b, 32768 + %res = icmp eq i32 %and, 0 + ret i1 %res +} + +; CHECK-LABEL: @add_bitreverse +; Make sure we process range metadata on bitreverse +define i8 @add_bitreverse(i8 %a) { + %b = and i8 %a, 252 + ; known bits for the bitreverse will say the result is in the range [0, 64) + ; but the metadata says [0, 16). So make sure the range metadata wins. + ; add %reverse, 1111 0000 + ; should become + ; or %reverse, 1111 0000 + %reverse = call i8 @llvm.bitreverse.i8(i8 %b), !range !1 + %c = add i8 %reverse, -16 +; CHECK: or i8 %reverse, -16 + ret i8 %c +} +!1 = !{i8 0, i8 16} diff --git a/test/Transforms/InstCombine/bswap-fold.ll b/test/Transforms/InstCombine/bswap-fold.ll index edf9572f1e11..91678a91962a 100644 --- a/test/Transforms/InstCombine/bswap-fold.ll +++ b/test/Transforms/InstCombine/bswap-fold.ll @@ -1,68 +1,75 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s -define i1 @test1(i16 %tmp2) { -; CHECK-LABEL: @test1 -; CHECK-NEXT: %tmp = icmp eq i16 %tmp2, 256 -; CHECK-NEXT: ret i1 %tmp - %tmp10 = call i16 @llvm.bswap.i16( i16 %tmp2 ) - %tmp = icmp eq i16 %tmp10, 1 - ret i1 %tmp +define i1 @test1(i16 %t) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i16 %t, 256 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %tmp1 = call i16 @llvm.bswap.i16( i16 %t ) + %tmp2 = icmp eq i16 %tmp1, 1 + ret i1 %tmp2 } define i1 @test2(i32 %tmp) { -; CHECK-LABEL: @test2 -; CHECK-NEXT: %tmp.upgrd.1 = icmp eq i32 %tmp, 16777216 -; CHECK-NEXT: ret i1 %tmp.upgrd.1 - %tmp34 = tail call i32 @llvm.bswap.i32( i32 %tmp ) - %tmp.upgrd.1 = icmp eq i32 %tmp34, 1 - ret i1 %tmp.upgrd.1 +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[TMP_UPGRD_1:%.*]] = icmp eq i32 %tmp, 16777216 +; CHECK-NEXT: ret i1 [[TMP_UPGRD_1]] +; + %tmp34 = tail call i32 @llvm.bswap.i32( i32 %tmp ) + %tmp.upgrd.1 = icmp eq i32 %tmp34, 1 + ret i1 %tmp.upgrd.1 } define i1 @test3(i64 %tmp) { -; CHECK-LABEL: @test3 -; CHECK-NEXT: %tmp.upgrd.2 = icmp eq i64 %tmp, 72057594037927936 -; CHECK-NEXT: ret i1 %tmp.upgrd.2 - %tmp34 = tail call i64 @llvm.bswap.i64( i64 %tmp ) - %tmp.upgrd.2 = icmp eq i64 %tmp34, 1 - ret i1 %tmp.upgrd.2 +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[TMP_UPGRD_2:%.*]] = icmp eq i64 %tmp, 72057594037927936 
+; CHECK-NEXT: ret i1 [[TMP_UPGRD_2]] +; + %tmp34 = tail call i64 @llvm.bswap.i64( i64 %tmp ) + %tmp.upgrd.2 = icmp eq i64 %tmp34, 1 + ret i1 %tmp.upgrd.2 } ; rdar://5992453 ; A & 255 define i32 @test4(i32 %a) nounwind { -; CHECK-LABEL: @test4 -; CHECK-NEXT: %tmp2 = and i32 %a, 255 -; CHECK-NEXT: ret i32 %tmp2 - %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a ) - %tmp4 = lshr i32 %tmp2, 24 - ret i32 %tmp4 +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[TMP2:%.*]] = and i32 %a, 255 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a ) + %tmp4 = lshr i32 %tmp2, 24 + ret i32 %tmp4 } ; A define i32 @test5(i32 %a) nounwind { -; CHECK-LABEL: @test5 -; CHECK-NEXT: ret i32 %a - %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a ) - %tmp4 = tail call i32 @llvm.bswap.i32( i32 %tmp2 ) - ret i32 %tmp4 +; CHECK-LABEL: @test5( +; CHECK-NEXT: ret i32 %a +; + %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a ) + %tmp4 = tail call i32 @llvm.bswap.i32( i32 %tmp2 ) + ret i32 %tmp4 } ; a >> 24 define i32 @test6(i32 %a) nounwind { -; CHECK-LABEL: @test6 -; CHECK-NEXT: %tmp2 = lshr i32 %a, 24 -; CHECK-NEXT: ret i32 %tmp2 - %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a ) - %tmp4 = and i32 %tmp2, 255 - ret i32 %tmp4 +; CHECK-LABEL: @test6( +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 %a, 24 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a ) + %tmp4 = and i32 %tmp2, 255 + ret i32 %tmp4 } ; PR5284 define i16 @test7(i32 %A) { -; CHECK-LABEL: @test7 -; CHECK-NEXT: %1 = lshr i32 %A, 16 -; CHECK-NEXT: %D = trunc i32 %1 to i16 -; CHECK-NEXT: ret i16 %D +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 %A, 16 +; CHECK-NEXT: [[D:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[D]] +; %B = tail call i32 @llvm.bswap.i32(i32 %A) nounwind %C = trunc i32 %B to i16 %D = tail call i16 @llvm.bswap.i16(i16 %C) nounwind @@ -70,11 +77,12 @@ define i16 @test7(i32 %A) { } define i16 @test8(i64 %A) { -; CHECK-LABEL: @test8 -; CHECK-NEXT: %1 = lshr i64 %A, 48 -; CHECK-NEXT: %D = trunc i64 %1 to i16 -; CHECK-NEXT: ret i16 %D - %B = tail call i64 @llvm.bswap.i64(i64 %A) nounwind +; CHECK-LABEL: @test8( +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 %A, 48 +; CHECK-NEXT: [[D:%.*]] = trunc i64 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[D]] +; + %B = tail call i64 @llvm.bswap.i64(i64 %A) nounwind %C = trunc i64 %B to i16 %D = tail call i16 @llvm.bswap.i16(i16 %C) nounwind ret i16 %D @@ -82,8 +90,9 @@ define i16 @test8(i64 %A) { ; Misc: Fold bswap(undef) to undef. 
define i64 @foo() { -; CHECK-LABEL: @foo -; CHECK-NEXT: ret i64 undef +; CHECK-LABEL: @foo( +; CHECK-NEXT: ret i64 undef +; %a = call i64 @llvm.bswap.i64(i64 undef) ret i64 %a } @@ -92,20 +101,22 @@ define i64 @foo() { ; Fold: OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) ) ; Fold: OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) ) define i16 @bs_and16i(i16 %a, i16 %b) #0 { -; CHECK-LABEL: @bs_and16i -; CHECK-NEXT: %1 = and i16 %a, 4391 -; CHECK-NEXT: %2 = call i16 @llvm.bswap.i16(i16 %1) -; CHECK-NEXT: ret i16 %2 +; CHECK-LABEL: @bs_and16i( +; CHECK-NEXT: [[TMP1:%.*]] = and i16 %a, 4391 +; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]]) +; CHECK-NEXT: ret i16 [[TMP2]] +; %1 = tail call i16 @llvm.bswap.i16(i16 %a) %2 = and i16 %1, 10001 ret i16 %2 } define i16 @bs_and16(i16 %a, i16 %b) #0 { -; CHECK-LABEL: @bs_and16 -; CHECK-NEXT: %1 = and i16 %a, %b -; CHECK-NEXT: %2 = call i16 @llvm.bswap.i16(i16 %1) -; CHECK-NEXT: ret i16 %2 +; CHECK-LABEL: @bs_and16( +; CHECK-NEXT: [[TMP1:%.*]] = and i16 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]]) +; CHECK-NEXT: ret i16 [[TMP2]] +; %tmp1 = tail call i16 @llvm.bswap.i16(i16 %a) %tmp2 = tail call i16 @llvm.bswap.i16(i16 %b) %tmp3 = and i16 %tmp1, %tmp2 @@ -113,10 +124,11 @@ define i16 @bs_and16(i16 %a, i16 %b) #0 { } define i16 @bs_or16(i16 %a, i16 %b) #0 { -; CHECK-LABEL: @bs_or16 -; CHECK-NEXT: %1 = or i16 %a, %b -; CHECK-NEXT: %2 = call i16 @llvm.bswap.i16(i16 %1) -; CHECK-NEXT: ret i16 %2 +; CHECK-LABEL: @bs_or16( +; CHECK-NEXT: [[TMP1:%.*]] = or i16 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]]) +; CHECK-NEXT: ret i16 [[TMP2]] +; %tmp1 = tail call i16 @llvm.bswap.i16(i16 %a) %tmp2 = tail call i16 @llvm.bswap.i16(i16 %b) %tmp3 = or i16 %tmp1, %tmp2 @@ -124,10 +136,11 @@ define i16 @bs_or16(i16 %a, i16 %b) #0 { } define i16 @bs_xor16(i16 %a, i16 %b) #0 { -; CHECK-LABEL: @bs_xor16 -; CHECK-NEXT: %1 = xor i16 %a, %b -; CHECK-NEXT: %2 = call i16 @llvm.bswap.i16(i16 %1) -; CHECK-NEXT: ret i16 %2 +; CHECK-LABEL: @bs_xor16( +; CHECK-NEXT: [[TMP1:%.*]] = xor i16 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]]) +; CHECK-NEXT: ret i16 [[TMP2]] +; %tmp1 = tail call i16 @llvm.bswap.i16(i16 %a) %tmp2 = tail call i16 @llvm.bswap.i16(i16 %b) %tmp3 = xor i16 %tmp1, %tmp2 @@ -135,20 +148,22 @@ define i16 @bs_xor16(i16 %a, i16 %b) #0 { } define i32 @bs_and32i(i32 %a, i32 %b) #0 { -; CHECK-LABEL: @bs_and32i -; CHECK-NEXT: %1 = and i32 %a, -1585053440 -; CHECK-NEXT: %2 = call i32 @llvm.bswap.i32(i32 %1) -; CHECK-NEXT: ret i32 %2 +; CHECK-LABEL: @bs_and32i( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 %a, -1585053440 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]]) +; CHECK-NEXT: ret i32 [[TMP2]] +; %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a) %tmp2 = and i32 %tmp1, 100001 ret i32 %tmp2 } define i32 @bs_and32(i32 %a, i32 %b) #0 { -; CHECK-LABEL: @bs_and32 -; CHECK-NEXT: %1 = and i32 %a, %b -; CHECK-NEXT: %2 = call i32 @llvm.bswap.i32(i32 %1) -; CHECK-NEXT: ret i32 %2 +; CHECK-LABEL: @bs_and32( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]]) +; CHECK-NEXT: ret i32 [[TMP2]] +; %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a) %tmp2 = tail call i32 @llvm.bswap.i32(i32 %b) %tmp3 = and i32 %tmp1, %tmp2 @@ -156,10 +171,11 @@ define i32 @bs_and32(i32 %a, i32 %b) #0 { } define i32 @bs_or32(i32 %a, i32 %b) #0 { -; CHECK-LABEL: @bs_or32 -; CHECK-NEXT: %1 = or i32 %a, %b -; CHECK-NEXT: %2 = call 
i32 @llvm.bswap.i32(i32 %1) -; CHECK-NEXT: ret i32 %2 +; CHECK-LABEL: @bs_or32( +; CHECK-NEXT: [[TMP1:%.*]] = or i32 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]]) +; CHECK-NEXT: ret i32 [[TMP2]] +; %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a) %tmp2 = tail call i32 @llvm.bswap.i32(i32 %b) %tmp3 = or i32 %tmp1, %tmp2 @@ -167,10 +183,11 @@ define i32 @bs_or32(i32 %a, i32 %b) #0 { } define i32 @bs_xor32(i32 %a, i32 %b) #0 { -; CHECK-LABEL: @bs_xor32 -; CHECK-NEXT: %1 = xor i32 %a, %b -; CHECK-NEXT: %2 = call i32 @llvm.bswap.i32(i32 %1) -; CHECK-NEXT: ret i32 %2 +; CHECK-LABEL: @bs_xor32( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]]) +; CHECK-NEXT: ret i32 [[TMP2]] +; %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a) %tmp2 = tail call i32 @llvm.bswap.i32(i32 %b) %tmp3 = xor i32 %tmp1, %tmp2 @@ -178,20 +195,22 @@ define i32 @bs_xor32(i32 %a, i32 %b) #0 { } define i64 @bs_and64i(i64 %a, i64 %b) #0 { -; CHECK-LABEL: @bs_and64i -; CHECK-NEXT: %1 = and i64 %a, 129085117527228416 -; CHECK-NEXT: %2 = call i64 @llvm.bswap.i64(i64 %1) -; CHECK-NEXT: ret i64 %2 +; CHECK-LABEL: @bs_and64i( +; CHECK-NEXT: [[TMP1:%.*]] = and i64 %a, 129085117527228416 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]]) +; CHECK-NEXT: ret i64 [[TMP2]] +; %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a) %tmp2 = and i64 %tmp1, 1000000001 ret i64 %tmp2 } define i64 @bs_and64(i64 %a, i64 %b) #0 { -; CHECK-LABEL: @bs_and64 -; CHECK-NEXT: %1 = and i64 %a, %b -; CHECK-NEXT: %2 = call i64 @llvm.bswap.i64(i64 %1) -; CHECK-NEXT: ret i64 %2 +; CHECK-LABEL: @bs_and64( +; CHECK-NEXT: [[TMP1:%.*]] = and i64 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]]) +; CHECK-NEXT: ret i64 [[TMP2]] +; %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a) %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b) %tmp3 = and i64 %tmp1, %tmp2 @@ -199,10 +218,11 @@ define i64 @bs_and64(i64 %a, i64 %b) #0 { } define i64 @bs_or64(i64 %a, i64 %b) #0 { -; CHECK-LABEL: @bs_or64 -; CHECK-NEXT: %1 = or i64 %a, %b -; CHECK-NEXT: %2 = call i64 @llvm.bswap.i64(i64 %1) -; CHECK-NEXT: ret i64 %2 +; CHECK-LABEL: @bs_or64( +; CHECK-NEXT: [[TMP1:%.*]] = or i64 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]]) +; CHECK-NEXT: ret i64 [[TMP2]] +; %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a) %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b) %tmp3 = or i64 %tmp1, %tmp2 @@ -210,10 +230,11 @@ define i64 @bs_or64(i64 %a, i64 %b) #0 { } define i64 @bs_xor64(i64 %a, i64 %b) #0 { -; CHECK-LABEL: @bs_xor64 -; CHECK-NEXT: %1 = xor i64 %a, %b -; CHECK-NEXT: %2 = call i64 @llvm.bswap.i64(i64 %1) -; CHECK-NEXT: ret i64 %2 +; CHECK-LABEL: @bs_xor64( +; CHECK-NEXT: [[TMP1:%.*]] = xor i64 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]]) +; CHECK-NEXT: ret i64 [[TMP2]] +; %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a) %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b) %tmp3 = xor i64 %tmp1, %tmp2 diff --git a/test/Transforms/InstCombine/builtin-object-size-offset.ll b/test/Transforms/InstCombine/builtin-object-size-offset.ll index 7ab24a9acd94..248cf644df89 100644 --- a/test/Transforms/InstCombine/builtin-object-size-offset.ll +++ b/test/Transforms/InstCombine/builtin-object-size-offset.ll @@ -26,25 +26,25 @@ entry: %Big = alloca [20 x i8], align 16 %Small = alloca [10 x i8], align 1 %0 = getelementptr inbounds [20 x i8], [20 x i8]* %Big, i64 0, i64 0 - call void @llvm.lifetime.start(i64 20, i8* %0) + call void 
@llvm.lifetime.start.p0i8(i64 20, i8* %0) %1 = getelementptr inbounds [10 x i8], [10 x i8]* %Small, i64 0, i64 0 - call void @llvm.lifetime.start(i64 10, i8* %1) + call void @llvm.lifetime.start.p0i8(i64 10, i8* %1) %tobool = icmp ne i32 %N, 0 %add.ptr = getelementptr inbounds [20 x i8], [20 x i8]* %Big, i64 0, i64 10 %cond = select i1 %tobool, i8* %add.ptr, i8* %1 %2 = call i64 @llvm.objectsize.i64.p0i8(i8* %cond, i1 false) %conv = trunc i64 %2 to i32 - call void @llvm.lifetime.end(i64 10, i8* %1) - call void @llvm.lifetime.end(i64 20, i8* %0) + call void @llvm.lifetime.end.p0i8(i64 10, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 20, i8* %0) ret i32 %conv ; CHECK: ret i32 10 } -declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) define void @foo() { entry: diff --git a/test/Transforms/InstCombine/builtin-object-size-ptr.ll b/test/Transforms/InstCombine/builtin-object-size-ptr.ll index b38513999dc1..ada3fc167026 100644 --- a/test/Transforms/InstCombine/builtin-object-size-ptr.ll +++ b/test/Transforms/InstCombine/builtin-object-size-ptr.ll @@ -16,19 +16,19 @@ define i32 @foo() #0 { entry: %var = alloca %struct.V, align 4 %0 = bitcast %struct.V* %var to i8* - call void @llvm.lifetime.start(i64 28, i8* %0) #3 + call void @llvm.lifetime.start.p0i8(i64 28, i8* %0) #3 %buf1 = getelementptr inbounds %struct.V, %struct.V* %var, i32 0, i32 0 %arrayidx = getelementptr inbounds [10 x i8], [10 x i8]* %buf1, i64 0, i64 1 %1 = call i64 @llvm.objectsize.i64.p0i8(i8* %arrayidx, i1 false) %conv = trunc i64 %1 to i32 - call void @llvm.lifetime.end(i64 28, i8* %0) #3 + call void @llvm.lifetime.end.p0i8(i64 28, i8* %0) #3 ret i32 %conv ; CHECK: ret i32 27 ; CHECK-NOT: ret i32 -1 } -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) #2 -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 diff --git a/test/Transforms/InstCombine/call-guard.ll b/test/Transforms/InstCombine/call-guard.ll new file mode 100644 index 000000000000..9664467f914b --- /dev/null +++ b/test/Transforms/InstCombine/call-guard.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare void @llvm.experimental.guard(i1, ...) + +define void @test_guard_adjacent_same_cond(i1 %A) { +; CHECK-LABEL: @test_guard_adjacent_same_cond( +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %A) [ "deopt"() ] +; CHECK-NEXT: ret void + call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ] + call void(i1, ...) 
@llvm.experimental.guard( i1 %A )[ "deopt"() ] + ret void +} + +define void @test_guard_adjacent_diff_cond(i1 %A, i1 %B, i1 %C) { +; CHECK-LABEL: @test_guard_adjacent_diff_cond( +; CHECK-NEXT: %1 = and i1 %A, %B +; CHECK-NEXT: %2 = and i1 %1, %C +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %2, i32 123) [ "deopt"() ] +; CHECK-NEXT: ret void + call void(i1, ...) @llvm.experimental.guard( i1 %A, i32 123 )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %B, i32 456 )[ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard( i1 %C, i32 789 )[ "deopt"() ] + ret void +} diff --git a/test/Transforms/InstCombine/call_nonnull_arg.ll b/test/Transforms/InstCombine/call_nonnull_arg.ll index c502aa05731e..8127f4734fcd 100644 --- a/test/Transforms/InstCombine/call_nonnull_arg.ll +++ b/test/Transforms/InstCombine/call_nonnull_arg.ll @@ -31,7 +31,7 @@ dead: unreachable } -; FIXME: The nonnull attribute in the 'bar' declaration could be +; The nonnull attribute in the 'bar' declaration is ; propagated to the parameters of the 'baz' callsite. declare void @bar(i8*, i8* nonnull) @@ -40,7 +40,7 @@ declare void @baz(i8*, i8*) define void @deduce_nonnull_from_another_call(i8* %a, i8* %b) { ; CHECK-LABEL: @deduce_nonnull_from_another_call( ; CHECK-NEXT: call void @bar(i8* %a, i8* %b) -; CHECK-NEXT: call void @baz(i8* %b, i8* %b) +; CHECK-NEXT: call void @baz(i8* nonnull %b, i8* nonnull %b) ; CHECK-NEXT: ret void ; call void @bar(i8* %a, i8* %b) diff --git a/test/Transforms/InstCombine/cast-call-combine-prof.ll b/test/Transforms/InstCombine/cast-call-combine-prof.ll new file mode 100644 index 000000000000..05b71b666e24 --- /dev/null +++ b/test/Transforms/InstCombine/cast-call-combine-prof.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +; Check that instcombine preserves !prof metadata when removing function +; prototype casts. + +declare i32 @__gxx_personality_v0(...) 
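; A minimal sketch of the rewrite under test (assumed semantics, not lines from
; this commit; %p and %p.cast are placeholder names): a call through a bitcast
; of the callee is turned into a direct call, with the cast moved onto the
; arguments instead:
;   call void bitcast (void (i16*)* @foo to void (i8*)*) (i8* %p)
; becomes
;   %p.cast = bitcast i8* %p to i16*
;   call void @foo(i16* %p.cast)
; With the i8* null argument used below, the argument cast constant-folds away,
; leaving @foo(i16* null); the tests then only have to check that the !prof
; metadata is carried over to the rewritten call.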
+declare void @__cxa_call_unexpected(i8*) +declare void @foo(i16* %a) + +; CHECK-LABEL: @test_call() +; CHECK: call void @foo(i16* null), !prof ![[PROF:[0-9]+]] +define void @test_call() { + call void bitcast (void (i16*)* @foo to void (i8*)*) (i8* null), !prof !0 + ret void +} + +; CHECK-LABEL: @test_invoke() +; CHECK: invoke void @foo(i16* null) +; CHECK-NEXT: to label %done unwind label %lpad, !prof ![[PROF]] +define void @test_invoke() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + invoke void bitcast (void (i16*)* @foo to void (i8*)*) (i8* null) + to label %done unwind label %lpad, !prof !0 + +done: + ret void + +lpad: + %lp = landingpad { i8*, i32 } + filter [0 x i8*] zeroinitializer + %ehptr = extractvalue { i8*, i32 } %lp, 0 + tail call void @__cxa_call_unexpected(i8* %ehptr) noreturn nounwind + unreachable +} + +; CHECK: ![[PROF]] = !{!"branch_weights", i32 2000} +!0 = !{!"VP", i32 0, i64 2000, i64 -3913987384944532146, i64 2000} + +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 1000, i32 1} +!13 = !{i32 999000, i64 1000, i32 1} +!14 = !{i32 999999, i64 1, i32 2} diff --git a/test/Transforms/InstCombine/compare-alloca.ll b/test/Transforms/InstCombine/compare-alloca.ll index ca24da191779..414a07825f2f 100644 --- a/test/Transforms/InstCombine/compare-alloca.ll +++ b/test/Transforms/InstCombine/compare-alloca.ll @@ -72,15 +72,15 @@ define i1 @alloca_argument_compare_escaped_through_store(i64* %arg, i64** %ptr) ; CHECK: ret i1 %cmp } -declare void @llvm.lifetime.start(i64, i8* nocapture) -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) define i1 @alloca_argument_compare_benign_instrs(i8* %arg) { %alloc = alloca i8 - call void @llvm.lifetime.start(i64 1, i8* %alloc) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %alloc) %cmp = icmp eq i8* %arg, %alloc %x = load i8, i8* %arg store i8 %x, i8* %alloc - call void @llvm.lifetime.end(i64 1, i8* %alloc) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %alloc) ret i1 %cmp ; CHECK-LABEL: alloca_argument_compare_benign_instrs ; CHECK: ret i1 false diff --git a/test/Transforms/InstCombine/compare-unescaped.ll b/test/Transforms/InstCombine/compare-unescaped.ll index 0e512aa28911..d15fc2fd4495 100644 --- a/test/Transforms/InstCombine/compare-unescaped.ll +++ b/test/Transforms/InstCombine/compare-unescaped.ll @@ -144,7 +144,7 @@ chk2: ret i8* %n ; CHECK-LABEL: compare_ret_escape ; CHECK: %cmp = icmp eq i8* %n, %c -; CHECK: %cmp2 = icmp eq i32* %bc, %lgp +; CHECK: %cmp2 = icmp eq i32* %lgp, %bc } ; The malloc call for %m cannot be elided since it is used in the call to function f. 
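; A short sketch of the reasoning behind these unescaped-pointer tests (assumed
; semantics, not lines from this commit; %m and %other are placeholder names):
;   %m = call i8* @malloc(i64 4)
;   %cmp = icmp eq i8* %m, %other
; As long as %m never escapes, no other pointer can observably equal it, so
; InstCombine is free to fold %cmp to i1 false. Once %m escapes (stored to
; memory, returned, or passed to a call such as the function f noted above),
; the comparison must be preserved.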
diff --git a/test/Transforms/InstCombine/consecutive-fences.ll b/test/Transforms/InstCombine/consecutive-fences.ll new file mode 100644 index 000000000000..6f1c41277386 --- /dev/null +++ b/test/Transforms/InstCombine/consecutive-fences.ll @@ -0,0 +1,47 @@ +; RUN: opt -instcombine -S %s | FileCheck %s + +; Make sure we collapse the fences in this case + +; CHECK-LABEL: define void @tinkywinky +; CHECK-NEXT: fence seq_cst +; CHECK-NEXT: fence singlethread acquire +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +define void @tinkywinky() { + fence seq_cst + fence seq_cst + fence seq_cst + fence singlethread acquire + fence singlethread acquire + fence singlethread acquire + ret void +} + +; CHECK-LABEL: define void @dipsy +; CHECK-NEXT: fence seq_cst +; CHECK-NEXT: fence singlethread seq_cst +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +define void @dipsy() { + fence seq_cst + fence singlethread seq_cst + ret void +} + +; CHECK-LABEL: define void @patatino +; CHECK-NEXT: fence acquire +; CHECK-NEXT: fence seq_cst +; CHECK-NEXT: fence acquire +; CHECK-NEXT: fence seq_cst +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +define void @patatino() { + fence acquire + fence seq_cst + fence acquire + fence seq_cst + ret void +} diff --git a/test/Transforms/InstCombine/constant-fold-math.ll b/test/Transforms/InstCombine/constant-fold-math.ll index ce8d337c08bf..50cd6070896e 100644 --- a/test/Transforms/InstCombine/constant-fold-math.ll +++ b/test/Transforms/InstCombine/constant-fold-math.ll @@ -45,12 +45,4 @@ define double @constant_fold_fmuladd_f64() #0 { ret double %x } -; The sqrt intrinsic is undefined for negative inputs besides -0.0. -; CHECK-LABEL: @bad_sqrt -; CHECK-NEXT: ret double undef -define double @bad_sqrt() { - %x = call double @llvm.sqrt.f64(double -2.000000e+00) - ret double %x -} - attributes #0 = { nounwind readnone } diff --git a/test/Transforms/InstCombine/convergent.ll b/test/Transforms/InstCombine/convergent.ll index d4484cf4567e..9b9ae6f5352c 100644 --- a/test/Transforms/InstCombine/convergent.ll +++ b/test/Transforms/InstCombine/convergent.ll @@ -27,7 +27,7 @@ define i32 @no_extern() { } define i32 @indirect_call(i32 ()* %f) { - ; CHECK call i32 %f() [[CONVERGENT_ATTR]] + ; CHECK: call i32 %f() [[CONVERGENT_ATTR]] %a = call i32 %f() convergent ret i32 %a } diff --git a/test/Transforms/InstCombine/deadcode.ll b/test/Transforms/InstCombine/deadcode.ll index 8fe673d8c9c0..c5fa58babdbc 100644 --- a/test/Transforms/InstCombine/deadcode.ll +++ b/test/Transforms/InstCombine/deadcode.ll @@ -22,12 +22,12 @@ define i32* @test2(i32 %width) { declare i8* @llvm.stacksave() -declare void @llvm.lifetime.start(i64, i8*) -declare void @llvm.lifetime.end(i64, i8*) +declare void @llvm.lifetime.start.p0i8(i64, i8*) +declare void @llvm.lifetime.end.p0i8(i64, i8*) define void @test3() { - call void @llvm.lifetime.start(i64 -1, i8* undef) - call void @llvm.lifetime.end(i64 -1, i8* undef) + call void @llvm.lifetime.start.p0i8(i64 -1, i8* undef) + call void @llvm.lifetime.end.p0i8(i64 -1, i8* undef) ret void } diff --git a/test/Transforms/InstCombine/debuginfo-dce.ll b/test/Transforms/InstCombine/debuginfo-dce.ll new file mode 100644 index 000000000000..e23aef7334d5 --- /dev/null +++ b/test/Transforms/InstCombine/debuginfo-dce.ll @@ -0,0 +1,106 @@ +; RUN: opt -instcombine %s -S -o - | FileCheck %s +; Verify that the eliminated instructions (bitcast, gep, load) are salvaged into +; a DIExpression. +; +; Originally created from the following C source and then heavily isolated/reduced. 
+; +; struct entry { +; struct entry *next; +; }; +; void scan(struct entry *queue, struct entry *end) +; { +; struct entry *entry; +; for (entry = (struct entry *)((char *)(queue->next) - 8); +; &entry->next == end; +; entry = (struct entry *)((char *)(entry->next) - 8)) { +; } +; } + +; ModuleID = '<stdin>' +source_filename = "test.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +%struct.entry = type { %struct.entry* } + +; Function Attrs: nounwind ssp uwtable +define void @salvage_load(%struct.entry** %queue) local_unnamed_addr #0 !dbg !14 { +entry: + %im_not_dead = alloca %struct.entry* + %0 = load %struct.entry*, %struct.entry** %queue, align 8, !dbg !19 + %1 = load %struct.entry*, %struct.entry** %queue, align 8, !dbg !19 + call void @llvm.dbg.value(metadata %struct.entry* %1, i64 0, metadata !18, metadata !20), !dbg !19 +; CHECK: define void @salvage_load +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.dbg.value(metadata %struct.entry** %queue, i64 0, +; CHECK-SAME: metadata ![[LOAD_EXPR:[0-9]+]]) + store %struct.entry* %1, %struct.entry** %im_not_dead, align 8 + ret void, !dbg !21 +} + +; Function Attrs: nounwind ssp uwtable +define void @salvage_bitcast(%struct.entry* %queue) local_unnamed_addr #0 !dbg !14 { +entry: + %im_not_dead = alloca i8* + %0 = bitcast %struct.entry* %queue to i8*, !dbg !19 + %1 = bitcast %struct.entry* %queue to i8*, !dbg !19 + call void @llvm.dbg.value(metadata i8* %1, i64 0, metadata !18, metadata !20), !dbg !19 +; CHECK: define void @salvage_bitcast +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.dbg.value(metadata %struct.entry* %queue, i64 0, +; CHECK-SAME: metadata ![[BITCAST_EXPR:[0-9]+]]) + store i8* %1, i8** %im_not_dead, align 8 + ret void, !dbg !21 +} + +; Function Attrs: nounwind ssp uwtable +define void @salvage_gep(%struct.entry* %queue, %struct.entry* %end) local_unnamed_addr #0 !dbg !14 { +entry: + %im_not_dead = alloca %struct.entry** + %0 = getelementptr inbounds %struct.entry, %struct.entry* %queue, i32 -1, i32 0, !dbg !19 + %1 = getelementptr inbounds %struct.entry, %struct.entry* %queue, i32 -1, i32 0, !dbg !19 + call void @llvm.dbg.value(metadata %struct.entry** %1, i64 0, metadata !18, metadata !20), !dbg !19 +; CHECK: define void @salvage_gep +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.dbg.value(metadata %struct.entry* %queue, i64 0, +; CHECK-SAME: metadata ![[GEP_EXPR:[0-9]+]]) + store %struct.entry** %1, %struct.entry*** %im_not_dead, align 8 + ret void, !dbg !21 +} + +; CHECK: ![[LOAD_EXPR]] = !DIExpression(DW_OP_deref, DW_OP_plus, 0) +; CHECK: ![[BITCAST_EXPR]] = !DIExpression(DW_OP_plus, 0) +; CHECK: ![[GEP_EXPR]] = !DIExpression(DW_OP_minus, 8, DW_OP_plus, 0) + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +attributes #0 = { nounwind ssp uwtable } +attributes #1 = { nounwind readnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!10, !11, !12} +!llvm.ident = !{!13} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (trunk 297628) (llvm/trunk 297643)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "test.c", directory: "/") +!2 = !{} +!3 = !{!4, !8} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64) +!5 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "entry", file: !1, line: 1, size: 64, elements: !6) +!6 = !{!7} +!7 = 
!DIDerivedType(tag: DW_TAG_member, name: "next", scope: !5, file: !1, line: 2, baseType: !4, size: 64) +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64) +!9 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!10 = !{i32 2, !"Dwarf Version", i32 4} +!11 = !{i32 2, !"Debug Info Version", i32 3} +!12 = !{i32 1, !"PIC Level", i32 2} +!13 = !{!"clang version 5.0.0 (trunk 297628) (llvm/trunk 297643)"} +!14 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{null, !4, !4} +!17 = !{!18} +!18 = !DILocalVariable(name: "entry", scope: !14, file: !1, line: 6, type: !4) +!19 = !DILocation(line: 6, column: 17, scope: !14) +!20 = !DIExpression(DW_OP_plus, 0) +!21 = !DILocation(line: 11, column: 1, scope: !14) diff --git a/test/Transforms/InstCombine/double-float-shrink-2.ll b/test/Transforms/InstCombine/double-float-shrink-2.ll index 7f6df92c96c5..4813614f26cb 100644 --- a/test/Transforms/InstCombine/double-float-shrink-2.ll +++ b/test/Transforms/InstCombine/double-float-shrink-2.ll @@ -1,28 +1,9 @@ -; RUN: opt < %s -instcombine -S -mtriple "i386-pc-linux" | FileCheck -check-prefix=DO-SIMPLIFY %s -; RUN: opt < %s -instcombine -S -mtriple "i386-pc-win32" | FileCheck -check-prefix=DONT-SIMPLIFY %s -; RUN: opt < %s -instcombine -S -mtriple "x86_64-pc-win32" | FileCheck -check-prefix=C89-SIMPLIFY %s -; RUN: opt < %s -instcombine -S -mtriple "i386-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY %s -; RUN: opt < %s -instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY %s -; RUN: opt < %s -instcombine -S -mtriple "sparc-sun-solaris" | FileCheck -check-prefix=DO-SIMPLIFY %s - -; DO-SIMPLIFY: call float @floorf( -; DO-SIMPLIFY: call float @ceilf( -; DO-SIMPLIFY: call float @roundf( -; DO-SIMPLIFY: call float @nearbyintf( -; DO-SIMPLIFY: call float @truncf( -; DO-SIMPLIFY: call float @fabsf( - -; C89-SIMPLIFY: call float @floorf( -; C89-SIMPLIFY: call float @ceilf( -; C89-SIMPLIFY: call double @round( -; C89-SIMPLIFY: call double @nearbyint( - -; DONT-SIMPLIFY: call double @floor( -; DONT-SIMPLIFY: call double @ceil( -; DONT-SIMPLIFY: call double @round( -; DONT-SIMPLIFY: call double @nearbyint( -; DONT-SIMPLIFY: call double @trunc( -; DONT-SIMPLIFY: call double @fabs( +; RUN: opt < %s -instcombine -S -mtriple "i386-pc-linux" | FileCheck -check-prefix=DO-SIMPLIFY -check-prefix=ALL %s +; RUN: opt < %s -instcombine -S -mtriple "i386-pc-win32" | FileCheck -check-prefix=DONT-SIMPLIFY -check-prefix=ALL %s +; RUN: opt < %s -instcombine -S -mtriple "x86_64-pc-win32" | FileCheck -check-prefix=C89-SIMPLIFY -check-prefix=ALL %s +; RUN: opt < %s -instcombine -S -mtriple "i386-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY -check-prefix=ALL %s +; RUN: opt < %s -instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY -check-prefix=ALL %s +; RUN: opt < %s -instcombine -S -mtriple "sparc-sun-solaris" | FileCheck -check-prefix=DO-SIMPLIFY -check-prefix=ALL %s declare double @floor(double) declare double @ceil(double) @@ -31,7 +12,18 @@ declare double @nearbyint(double) declare double @trunc(double) declare double @fabs(double) -define float @test_floor(float %C) { +declare double @llvm.floor.f64(double) +declare double @llvm.ceil.f64(double) +declare double @llvm.round.f64(double) +declare double @llvm.nearbyint.f64(double) +declare double 
@llvm.trunc.f64(double) +declare double @llvm.fabs.f64(double) + +; ALL-LABEL: @test_shrink_libcall_floor( +; DO-SIMPLIFY: call float @llvm.floor.f32( +; C89-SIMPLIFY: call float @llvm.floor.f32( +; DONT-SIMPLIFY: call float @llvm.floor.f32( +define float @test_shrink_libcall_floor(float %C) { %D = fpext float %C to double ; --> floorf %E = call double @floor(double %D) @@ -39,7 +31,11 @@ define float @test_floor(float %C) { ret float %F } -define float @test_ceil(float %C) { +; ALL-LABEL: @test_shrink_libcall_ceil( +; DO-SIMPLIFY: call float @llvm.ceil.f32( +; C89-SIMPLIFY: call float @llvm.ceil.f32( +; DONT-SIMPLIFY: call float @llvm.ceil.f32( +define float @test_shrink_libcall_ceil(float %C) { %D = fpext float %C to double ; --> ceilf %E = call double @ceil(double %D) @@ -47,7 +43,11 @@ define float @test_ceil(float %C) { ret float %F } -define float @test_round(float %C) { +; ALL-LABEL: @test_shrink_libcall_round( +; DO-SIMPLIFY: call float @llvm.round.f32( +; C89-SIMPLIFY: call double @round( +; DONT-SIMPLIFY: call double @round( +define float @test_shrink_libcall_round(float %C) { %D = fpext float %C to double ; --> roundf %E = call double @round(double %D) @@ -55,7 +55,11 @@ define float @test_round(float %C) { ret float %F } -define float @test_nearbyint(float %C) { +; ALL-LABEL: @test_shrink_libcall_nearbyint( +; DO-SIMPLIFY: call float @llvm.nearbyint.f32( +; C89-SIMPLIFY: call double @nearbyint( +; DONT-SIMPLIFY: call double @nearbyint( +define float @test_shrink_libcall_nearbyint(float %C) { %D = fpext float %C to double ; --> nearbyintf %E = call double @nearbyint(double %D) @@ -63,7 +67,10 @@ define float @test_nearbyint(float %C) { ret float %F } -define float @test_trunc(float %C) { +; ALL-LABEL: @test_shrink_libcall_trunc( +; DO-SIMPLIFY: call float @llvm.trunc.f32( +; DONT-SIMPLIFY: call double @trunc( +define float @test_shrink_libcall_trunc(float %C) { %D = fpext float %C to double ; --> truncf %E = call double @trunc(double %D) @@ -71,10 +78,386 @@ define float @test_trunc(float %C) { ret float %F } -define float @test_fabs(float %C) { +; ALL-LABEL: @test_shrink_libcall_fabs( +; DO-SIMPLIFY: call float @llvm.fabs.f32( + +; This is replaced with the intrinsic, which does the right thing on +; all platforms. 
+; DONT-SIMPLIFY: call float @llvm.fabs.f32( +define float @test_shrink_libcall_fabs(float %C) { %D = fpext float %C to double ; --> fabsf %E = call double @fabs(double %D) %F = fptrunc double %E to float ret float %F } + +; Make sure fast math flags are preserved +; ALL-LABEL: @test_shrink_libcall_fabs_fast( +; DO-SIMPLIFY: call fast float @llvm.fabs.f32( +define float @test_shrink_libcall_fabs_fast(float %C) { + %D = fpext float %C to double + ; --> fabsf + %E = call fast double @fabs(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_floor( +; ALL: call float @llvm.floor.f32( +define float @test_shrink_intrin_floor(float %C) { + %D = fpext float %C to double + ; --> floorf + %E = call double @llvm.floor.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_ceil( +; ALL: call float @llvm.ceil.f32( +define float @test_shrink_intrin_ceil(float %C) { + %D = fpext float %C to double + ; --> ceilf + %E = call double @llvm.ceil.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_round( +; ALL: call float @llvm.round.f32( +define float @test_shrink_intrin_round(float %C) { + %D = fpext float %C to double + ; --> roundf + %E = call double @llvm.round.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_nearbyint( +; ALL: call float @llvm.nearbyint.f32( +define float @test_shrink_intrin_nearbyint(float %C) { + %D = fpext float %C to double + ; --> nearbyintf + %E = call double @llvm.nearbyint.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_trunc( +; ALL: call float @llvm.trunc.f32( +define float @test_shrink_intrin_trunc(float %C) { + %D = fpext float %C to double + %E = call double @llvm.trunc.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_fabs( +; ALL: call float @llvm.fabs.f32( +define float @test_shrink_intrin_fabs(float %C) { + %D = fpext float %C to double + %E = call double @llvm.fabs.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; Make sure fast math flags are preserved +; ALL-LABEL: @test_shrink_intrin_fabs_fast( +; ALL: call fast float @llvm.fabs.f32( +define float @test_shrink_intrin_fabs_fast(float %C) { + %D = fpext float %C to double + %E = call fast double @llvm.fabs.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_no_shrink_intrin_floor( +; ALL: call double @llvm.floor.f64( +define float @test_no_shrink_intrin_floor(double %D) { + %E = call double @llvm.floor.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_no_shrink_intrin_ceil( +; ALL: call double @llvm.ceil.f64( +define float @test_no_shrink_intrin_ceil(double %D) { + %E = call double @llvm.ceil.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_no_shrink_intrin_round( +; ALL: call double @llvm.round.f64( +define float @test_no_shrink_intrin_round(double %D) { + %E = call double @llvm.round.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_no_shrink_intrin_nearbyint( +; ALL: call double @llvm.nearbyint.f64( +define float @test_no_shrink_intrin_nearbyint(double %D) { + %E = call double @llvm.nearbyint.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_no_shrink_intrin_trunc( +; ALL: call 
double @llvm.trunc.f64( +define float @test_no_shrink_intrin_trunc(double %D) { + %E = call double @llvm.trunc.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_fabs_double_src( +; ALL: call float @llvm.fabs.f32( +define float @test_shrink_intrin_fabs_double_src(double %D) { + %E = call double @llvm.fabs.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; Make sure fast math flags are preserved +; ALL-LABEL: @test_shrink_intrin_fabs_fast_double_src( +; ALL: call fast float @llvm.fabs.f32( +define float @test_shrink_intrin_fabs_fast_double_src(double %D) { + %E = call fast double @llvm.fabs.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_floor( +; ALL: ret float 2.000000e+00 +define float @test_shrink_float_convertible_constant_intrin_floor() { + %E = call double @llvm.floor.f64(double 2.1) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_ceil( +; ALL: ret float 3.000000e+00 +define float @test_shrink_float_convertible_constant_intrin_ceil() { + %E = call double @llvm.ceil.f64(double 2.1) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_round( +; ALL: ret float 2.000000e+00 +define float @test_shrink_float_convertible_constant_intrin_round() { + %E = call double @llvm.round.f64(double 2.1) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_nearbyint( +; ALL: ret float 2.000000e+00 +define float @test_shrink_float_convertible_constant_intrin_nearbyint() { + %E = call double @llvm.nearbyint.f64(double 2.1) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_trunc( +; ALL: ret float 2.000000e+00 +define float @test_shrink_float_convertible_constant_intrin_trunc() { + %E = call double @llvm.trunc.f64(double 2.1) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_fabs( +; ALL: ret float 0x4000CCCCC0000000 +define float @test_shrink_float_convertible_constant_intrin_fabs() { + %E = call double @llvm.fabs.f64(double 2.1) + %F = fptrunc double %E to float + ret float %F +} + +; Make sure fast math flags are preserved +; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_fabs_fast( +; ALL: ret float 0x4000CCCCC0000000 +define float @test_shrink_float_convertible_constant_intrin_fabs_fast() { + %E = call fast double @llvm.fabs.f64(double 2.1) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_floor( +; ALL-NEXT: %E = call double @llvm.floor.f64(double %D) +; ALL-NEXT: %F = fptrunc double %E to half +; ALL-NEXT: ret half %F +define half @test_no_shrink_mismatched_type_intrin_floor(double %D) { + %E = call double @llvm.floor.f64(double %D) + %F = fptrunc double %E to half + ret half %F +} + +; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_ceil( +; ALL-NEXT: %E = call double @llvm.ceil.f64(double %D) +; ALL-NEXT: %F = fptrunc double %E to half +; ALL-NEXT: ret half %F +define half @test_no_shrink_mismatched_type_intrin_ceil(double %D) { + %E = call double @llvm.ceil.f64(double %D) + %F = fptrunc double %E to half + ret half %F +} + +; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_round( +; ALL-NEXT: %E = call double @llvm.round.f64(double %D) +; ALL-NEXT: %F = 
fptrunc double %E to half +; ALL-NEXT: ret half %F +define half @test_no_shrink_mismatched_type_intrin_round(double %D) { + %E = call double @llvm.round.f64(double %D) + %F = fptrunc double %E to half + ret half %F +} + +; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_nearbyint( +; ALL-NEXT: %E = call double @llvm.nearbyint.f64(double %D) +; ALL-NEXT: %F = fptrunc double %E to half +; ALL-NEXT: ret half %F +define half @test_no_shrink_mismatched_type_intrin_nearbyint(double %D) { + %E = call double @llvm.nearbyint.f64(double %D) + %F = fptrunc double %E to half + ret half %F +} + +; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_trunc( +; ALL-NEXT: %E = call double @llvm.trunc.f64(double %D) +; ALL-NEXT: %F = fptrunc double %E to half +; ALL-NEXT: ret half %F +define half @test_no_shrink_mismatched_type_intrin_trunc(double %D) { + %E = call double @llvm.trunc.f64(double %D) + %F = fptrunc double %E to half + ret half %F +} + +; ALL-LABEL: @test_shrink_mismatched_type_intrin_fabs_double_src( +; ALL-NEXT: %1 = fptrunc double %D to half +; ALL-NEXT: %F = call half @llvm.fabs.f16(half %1) +; ALL-NEXT: ret half %F +define half @test_shrink_mismatched_type_intrin_fabs_double_src(double %D) { + %E = call double @llvm.fabs.f64(double %D) + %F = fptrunc double %E to half + ret half %F +} + +; Make sure fast math flags are preserved +; ALL-LABEL: @test_mismatched_type_intrin_fabs_fast_double_src( +; ALL-NEXT: %1 = fptrunc double %D to half +; ALL-NEXT: %F = call fast half @llvm.fabs.f16(half %1) +; ALL-NEXT: ret half %F +define half @test_mismatched_type_intrin_fabs_fast_double_src(double %D) { + %E = call fast double @llvm.fabs.f64(double %D) + %F = fptrunc double %E to half + ret half %F +} + +; ALL-LABEL: @test_shrink_intrin_floor_fp16_src( +; ALL-NEXT: %E = call half @llvm.floor.f16(half %C) +; ALL-NEXT: %1 = fpext half %E to double +; ALL-NEXT: %F = fptrunc double %1 to float +define float @test_shrink_intrin_floor_fp16_src(half %C) { + %D = fpext half %C to double + %E = call double @llvm.floor.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_ceil_fp16_src( +; ALL-NEXT: %E = call half @llvm.ceil.f16(half %C) +; ALL-NEXT: %1 = fpext half %E to double +; ALL-NEXT: %F = fptrunc double %1 to float +; ALL-NEXT: ret float %F +define float @test_shrink_intrin_ceil_fp16_src(half %C) { + %D = fpext half %C to double + %E = call double @llvm.ceil.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_round_fp16_src( +; ALL-NEXT: %E = call half @llvm.round.f16(half %C) +; ALL-NEXT: %1 = fpext half %E to double +; ALL-NEXT: %F = fptrunc double %1 to float +; ALL-NEXT: ret float %F +define float @test_shrink_intrin_round_fp16_src(half %C) { + %D = fpext half %C to double + %E = call double @llvm.round.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_nearbyint_fp16_src( +; ALL-NEXT: %E = call half @llvm.nearbyint.f16(half %C) +; ALL-NEXT: %1 = fpext half %E to double +; ALL-NEXT: %F = fptrunc double %1 to float +; ALL-NEXT: ret float %F +define float @test_shrink_intrin_nearbyint_fp16_src(half %C) { + %D = fpext half %C to double + %E = call double @llvm.nearbyint.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_trunc_fp16_src( +; ALL-NEXT: %E = call half @llvm.trunc.f16(half %C) +; ALL-NEXT: %1 = fpext half %E to double +; ALL-NEXT: %F = fptrunc double %1 to float +; ALL-NEXT: ret float %F 
+define float @test_shrink_intrin_trunc_fp16_src(half %C) { + %D = fpext half %C to double + %E = call double @llvm.trunc.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_shrink_intrin_fabs_fp16_src( +; ALL-NEXT: %E = call half @llvm.fabs.f16(half %C) +; ALL-NEXT: %1 = fpext half %E to double +; ALL-NEXT: %F = fptrunc double %1 to float +; ALL-NEXT: ret float %F +define float @test_shrink_intrin_fabs_fp16_src(half %C) { + %D = fpext half %C to double + %E = call double @llvm.fabs.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; Make sure fast math flags are preserved +; ALL-LABEL: @test_shrink_intrin_fabs_fast_fp16_src( +; ALL-NEXT: %E = call fast half @llvm.fabs.f16(half %C) +; ALL-NEXT: %1 = fpext half %E to double +; ALL-NEXT: %F = fptrunc double %1 to float +; ALL-NEXT: ret float %F +define float @test_shrink_intrin_fabs_fast_fp16_src(half %C) { + %D = fpext half %C to double + %E = call fast double @llvm.fabs.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext( +; ALL: %D = fpext half %C to double +; ALL: call double @llvm.floor.f64 +define float @test_no_shrink_intrin_floor_multi_use_fpext(half %C) { + %D = fpext half %C to double + store volatile double %D, double* undef + %E = call double @llvm.floor.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} + +; ALL-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext( +; ALL: %D = fpext half %C to double +; ALL: call double @llvm.fabs.f64 +define float @test_no_shrink_intrin_fabs_multi_use_fpext(half %C) { + %D = fpext half %C to double + store volatile double %D, double* undef + %E = call double @llvm.fabs.f64(double %D) + %F = fptrunc double %E to float + ret float %F +} diff --git a/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll b/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll new file mode 100644 index 000000000000..107440f10a5a --- /dev/null +++ b/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll @@ -0,0 +1,92 @@ +; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Test basic unfolding +define void @test1(i8* %Src, i8* %Dst) { +; CHECK-LABEL: test1 +; CHECK-NOT: llvm.memcpy.element.atomic + +; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32* +; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32* + +; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4 +; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 8 + +; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4 +; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4 + +; CHECK-DAG: [[VAL3:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4 +; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4 + +; CHECK-DAG: [[VAL4:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4 +; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4 +entry: + call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 8 %Src, i64 4, i32 4) + ret void +} + +; Test that we don't unfold too much +define void @test2(i8* %Src, i8* %Dst) { +; CHECK-LABEL: test2 + +; CHECK-NOT: load +; CHECK-NOT: store +; CHECK: llvm.memcpy.element.atomic +entry: + call void 
@llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 1000, i32 4) + ret void +} + +; Test that we will not unfold into non native integers +define void @test3(i8* %Src, i8* %Dst) { +; CHECK-LABEL: test3 + +; CHECK-NOT: load +; CHECK-NOT: store +; CHECK: llvm.memcpy.element.atomic +entry: + call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 4, i32 64) + ret void +} + +; Test that we will eliminate redundant bitcasts +define void @test4(i64* %Src, i64* %Dst) { +; CHECK-LABEL: test4 +; CHECK-NOT: llvm.memcpy.element.atomic + +; CHECK-NOT: bitcast + +; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i64, i64* %Src unordered, align 16 +; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16 + +; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1 +; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1 +; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8 +; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8 + +; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2 +; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2 +; CHECK-DAG: [[VAL3:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8 +; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8 + +; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3 +; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3 +; CHECK-DAG: [[VAL4:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8 +; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8 +entry: + %Src.casted = bitcast i64* %Src to i8* + %Dst.casted = bitcast i64* %Dst to i8* + call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i64 4, i32 8) + ret void +} + +define void @test5(i8* %Src, i8* %Dst) { +; CHECK-LABEL: test5 + +; CHECK-NOT: llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64) +entry: + call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64) + ret void +} + +declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32) diff --git a/test/Transforms/InstCombine/exact.ll b/test/Transforms/InstCombine/exact.ll index 436d5081c7aa..96b6fd689964 100644 --- a/test/Transforms/InstCombine/exact.ll +++ b/test/Transforms/InstCombine/exact.ll @@ -99,12 +99,12 @@ define i64 @ashr1(i64 %X) { ret i64 %B } -; FIXME: The ashr should be exact (like it is in the preceding test). +; The vector ashr should be exact (like it is in the preceding test). 
define <2 x i64> @ashr1_vec(<2 x i64> %X) { ; CHECK-LABEL: @ashr1_vec( ; CHECK-NEXT: [[A:%.*]] = shl <2 x i64> %X, <i64 8, i64 8> -; CHECK-NEXT: [[B:%.*]] = ashr <2 x i64> [[A]], <i64 2, i64 2> +; CHECK-NEXT: [[B:%.*]] = ashr exact <2 x i64> [[A]], <i64 2, i64 2> ; CHECK-NEXT: ret <2 x i64> [[B]] ; %A = shl <2 x i64> %X, <i64 8, i64 8> diff --git a/test/Transforms/InstCombine/fabs-libcall.ll b/test/Transforms/InstCombine/fabs-libcall.ll new file mode 100644 index 000000000000..5733badfa8f9 --- /dev/null +++ b/test/Transforms/InstCombine/fabs-libcall.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -mtriple=i686-apple-macosx -instcombine %s | FileCheck %s + +declare x86_fp80 @fabsl(x86_fp80) + +; CHECK-LABEL: @replace_fabs_call_f80( +; CHECK-NEXT: %fabsl = call x86_fp80 @llvm.fabs.f80(x86_fp80 %x) +; CHECK-NEXT: ret x86_fp80 %fabsl +define x86_fp80 @replace_fabs_call_f80(x86_fp80 %x) { + %fabsl = tail call x86_fp80 @fabsl(x86_fp80 %x) + ret x86_fp80 %fabsl + +} + +; CHECK-LABEL: @fmf_replace_fabs_call_f80( +; CHECK-NEXT: %fabsl = call nnan x86_fp80 @llvm.fabs.f80(x86_fp80 %x) +; CHECK-NEXT: ret x86_fp80 %fabsl +define x86_fp80 @fmf_replace_fabs_call_f80(x86_fp80 %x) { + %fabsl = tail call nnan x86_fp80 @fabsl(x86_fp80 %x) + ret x86_fp80 %fabsl +} + diff --git a/test/Transforms/InstCombine/fabs.ll b/test/Transforms/InstCombine/fabs.ll index aee853ae9eeb..a95f7b306b55 100644 --- a/test/Transforms/InstCombine/fabs.ll +++ b/test/Transforms/InstCombine/fabs.ll @@ -1,6 +1,10 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -instcombine -S | FileCheck %s -; Make sure all library calls are eliminated when the input is known positive. +; Make sure libcalls are replaced with intrinsic calls. + +declare float @llvm.fabs.f32(float) +declare double @llvm.fabs.f64(double) +declare fp128 @llvm.fabs.f128(fp128) declare float @fabsf(float) declare double @fabs(double) @@ -8,46 +12,46 @@ declare fp128 @fabsl(fp128) declare float @llvm.fma.f32(float, float, float) declare float @llvm.fmuladd.f32(float, float, float) -define float @square_fabs_call_f32(float %x) { - %mul = fmul float %x, %x - %fabsf = tail call float @fabsf(float %mul) +define float @replace_fabs_call_f32(float %x) { + %fabsf = tail call float @fabsf(float %x) ret float %fabsf -; CHECK-LABEL: square_fabs_call_f32( -; CHECK-NEXT: %mul = fmul float %x, %x -; CHECK-NEXT: %fabsf = tail call float @fabsf(float %mul) +; CHECK-LABEL: @replace_fabs_call_f32( +; CHECK-NEXT: %fabsf = call float @llvm.fabs.f32(float %x) ; CHECK-NEXT: ret float %fabsf } -define double @square_fabs_call_f64(double %x) { - %mul = fmul double %x, %x - %fabs = tail call double @fabs(double %mul) +define double @replace_fabs_call_f64(double %x) { + %fabs = tail call double @fabs(double %x) ret double %fabs -; CHECK-LABEL: square_fabs_call_f64( -; CHECK-NEXT: %mul = fmul double %x, %x -; CHECK-NEXT: %fabs = tail call double @fabs(double %mul) +; CHECK-LABEL: @replace_fabs_call_f64( +; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x) ; CHECK-NEXT: ret double %fabs } -define fp128 @square_fabs_call_f128(fp128 %x) { - %mul = fmul fp128 %x, %x - %fabsl = tail call fp128 @fabsl(fp128 %mul) +define fp128 @replace_fabs_call_f128(fp128 %x) { + %fabsl = tail call fp128 @fabsl(fp128 %x) ret fp128 %fabsl -; CHECK-LABEL: square_fabs_call_f128( -; CHECK-NEXT: %mul = fmul fp128 %x, %x -; CHECK-NEXT: %fabsl = tail call fp128 @fabsl(fp128 %mul) +; CHECK-LABEL: replace_fabs_call_f128( +; CHECK-NEXT: %fabsl = call fp128 @llvm.fabs.f128(fp128 %x) ; 
CHECK-NEXT: ret fp128 %fabsl } +; Make sure fast math flags are preserved when replacing the libcall. +define float @fmf_replace_fabs_call_f32(float %x) { + %fabsf = tail call nnan float @fabsf(float %x) + ret float %fabsf + +; CHECK-LABEL: @fmf_replace_fabs_call_f32( +; CHECK-NEXT: %fabsf = call nnan float @llvm.fabs.f32(float %x) +; CHECK-NEXT: ret float %fabsf +} + ; Make sure all intrinsic calls are eliminated when the input is known ; positive. -declare float @llvm.fabs.f32(float) -declare double @llvm.fabs.f64(double) -declare fp128 @llvm.fabs.f128(fp128) - ; The fabs cannot be eliminated because %x may be a NaN define float @square_fabs_intrinsic_f32(float %x) { %mul = fmul float %x, %x @@ -102,10 +106,8 @@ define float @square_fabs_shrink_call1(float %x) { ret float %trunc ; CHECK-LABEL: square_fabs_shrink_call1( -; CHECK-NEXT: %ext = fpext float %x to double -; CHECK-NEXT: %sq = fmul double %ext, %ext -; CHECK-NEXT: call double @fabs(double %sq) -; CHECK-NEXT: %trunc = fptrunc double %fabs to float +; CHECK-NEXT: fmul float %x, %x +; CHECK-NEXT: %trunc = call float @llvm.fabs.f32(float ; CHECK-NEXT: ret float %trunc } @@ -118,8 +120,8 @@ define float @square_fabs_shrink_call2(float %x) { ; CHECK-LABEL: square_fabs_shrink_call2( ; CHECK-NEXT: %sq = fmul float %x, %x -; CHECK-NEXT: %fabsf = call float @fabsf(float %sq) -; CHECK-NEXT: ret float %fabsf +; CHECK-NEXT: %trunc = call float @llvm.fabs.f32(float %sq) +; CHECK-NEXT: ret float %trunc } ; CHECK-LABEL: @fabs_select_constant_negative_positive( @@ -214,3 +216,16 @@ define float @square_nnan_fmuladd_fabs_intrinsic_f32(float %x) { ; CHECK-NEXT: %fmuladd = call nnan float @llvm.fmuladd.f32(float %x, float %x, float 1.000000e+00) ; CHECK-NEXT: ret float %fmuladd } + +; Don't introduce a second fpext +; CHECK-LABEL: @multi_use_fabs_fpext( +; CHECK: %fpext = fpext float %x to double +; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %fpext) +; CHECK-NEXT: store volatile double %fpext, double* undef, align 8 +; CHECK-NEXT: ret double %fabs +define double @multi_use_fabs_fpext(float %x) { + %fpext = fpext float %x to double + %fabs = call double @llvm.fabs.f64(double %fpext) + store volatile double %fpext, double* undef + ret double %fabs +} diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll index ad8a9247e4e1..6ddf3a58529f 100644 --- a/test/Transforms/InstCombine/fast-math.ll +++ b/test/Transforms/InstCombine/fast-math.ll @@ -831,3 +831,26 @@ define fp128 @min4(fp128 %a, fp128 %b) { ; CHECK-NEXT: select {{.*}} fp128 %a, fp128 %b ; CHECK-NEXT: ret } + +define float @test55(i1 %which, float %a) { +; CHECK-LABEL: @test55( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: [[PHITMP:%.*]] = fadd fast float [[A:%.*]], 1.000000e+00 +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi float [ 3.000000e+00, [[ENTRY:%.*]] ], [ [[PHITMP]], [[DELAY]] ] +; CHECK-NEXT: ret float [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi float [ 2.0, %entry ], [ %a, %delay ] + %value = fadd fast float %A, 1.0 + ret float %value +} diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll index 7fd46f228183..40f7bf9b64fa 100644 --- a/test/Transforms/InstCombine/fcmp.ll +++ b/test/Transforms/InstCombine/fcmp.ll @@ -3,238 +3,291 @@ declare double @llvm.fabs.f64(double) nounwind readnone define i1 @test1(float %x, 
float %y) nounwind { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float %x, %y +; CHECK-NEXT: ret i1 [[CMP]] +; %ext1 = fpext float %x to double %ext2 = fpext float %y to double %cmp = fcmp ogt double %ext1, %ext2 ret i1 %cmp -; CHECK-LABEL: @test1( -; CHECK-NEXT: fcmp ogt float %x, %y } define i1 @test2(float %a) nounwind { +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float %a, 1.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; %ext = fpext float %a to double %cmp = fcmp ogt double %ext, 1.000000e+00 ret i1 %cmp -; CHECK-LABEL: @test2( -; CHECK-NEXT: fcmp ogt float %a, 1.0 } define i1 @test3(float %a) nounwind { +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[EXT:%.*]] = fpext float %a to double +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt double [[EXT]], 0x3FF0000000000001 +; CHECK-NEXT: ret i1 [[CMP]] +; %ext = fpext float %a to double %cmp = fcmp ogt double %ext, 0x3FF0000000000001 ; more precision than float. ret i1 %cmp -; CHECK-LABEL: @test3( -; CHECK-NEXT: fpext float %a to double } define i1 @test4(float %a) nounwind { +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[EXT:%.*]] = fpext float %a to double +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt double [[EXT]], 0x36A0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; %ext = fpext float %a to double %cmp = fcmp ogt double %ext, 0x36A0000000000000 ; denormal in float. ret i1 %cmp -; CHECK-LABEL: @test4( -; CHECK-NEXT: fpext float %a to double } define i1 @test5(float %a) nounwind { +; CHECK-LABEL: @test5( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float %a, -1.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; %neg = fsub float -0.000000e+00, %a %cmp = fcmp ogt float %neg, 1.000000e+00 ret i1 %cmp -; CHECK-LABEL: @test5( -; CHECK-NEXT: fcmp olt float %a, -1.0 } define i1 @test6(float %x, float %y) nounwind { +; CHECK-LABEL: @test6( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float %x, %y +; CHECK-NEXT: ret i1 [[CMP]] +; %neg1 = fsub float -0.000000e+00, %x %neg2 = fsub float -0.000000e+00, %y %cmp = fcmp olt float %neg1, %neg2 ret i1 %cmp -; CHECK-LABEL: @test6( -; CHECK-NEXT: fcmp ogt float %x, %y } define i1 @test7(float %x) nounwind readnone ssp noredzone { +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float %x, 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; %ext = fpext float %x to ppc_fp128 %cmp = fcmp ogt ppc_fp128 %ext, 0xM00000000000000000000000000000000 ret i1 %cmp -; CHECK-LABEL: @test7( -; CHECK-NEXT: fcmp ogt float %x, 0.000000e+00 } define float @test8(float %x) nounwind readnone optsize ssp { +; CHECK-LABEL: @test8( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float %x, 0.000000e+00 +; CHECK-NEXT: [[CONV2:%.*]] = uitofp i1 [[CMP]] to float +; CHECK-NEXT: ret float [[CONV2]] +; %conv = fpext float %x to double %cmp = fcmp olt double %conv, 0.000000e+00 %conv1 = zext i1 %cmp to i32 %conv2 = sitofp i32 %conv1 to float ret float %conv2 ; Float comparison to zero shouldn't cast to double. 
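; (Sketch of why: fpext is exact and 0.0 is representable at every width, so
; the compare can be done directly on the float; and since the zext of an i1
; is only ever 0 or 1, the sitofp is equivalently a uitofp.)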
-; CHECK-LABEL: @test8( -; CHECK-NEXT: fcmp olt float %x, 0.000000e+00 } declare double @fabs(double) nounwind readnone define i32 @test9(double %a) nounwind { +; CHECK-LABEL: @test9( +; CHECK-NEXT: ret i32 0 +; %call = tail call double @fabs(double %a) nounwind %cmp = fcmp olt double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test9( -; CHECK-NOT: fabs -; CHECK: ret i32 0 } define i32 @test9_intrinsic(double %a) nounwind { +; CHECK-LABEL: @test9_intrinsic( +; CHECK-NEXT: ret i32 0 +; %call = tail call double @llvm.fabs.f64(double %a) nounwind %cmp = fcmp olt double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test9_intrinsic( -; CHECK-NOT: fabs -; CHECK: ret i32 0 } define i32 @test10(double %a) nounwind { +; CHECK-LABEL: @test10( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @fabs(double %a) nounwind %cmp = fcmp ole double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test10( -; CHECK-NOT: fabs -; CHECK: fcmp oeq double %a, 0.000000e+00 } define i32 @test10_intrinsic(double %a) nounwind { +; CHECK-LABEL: @test10_intrinsic( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @llvm.fabs.f64(double %a) nounwind %cmp = fcmp ole double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test10_intrinsic( -; CHECK-NOT: fabs -; CHECK: fcmp oeq double %a, 0.000000e+00 } define i32 @test11(double %a) nounwind { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[CMP:%.*]] = fcmp one double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @fabs(double %a) nounwind %cmp = fcmp ogt double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test11( -; CHECK-NOT: fabs -; CHECK: fcmp one double %a, 0.000000e+00 } define i32 @test11_intrinsic(double %a) nounwind { +; CHECK-LABEL: @test11_intrinsic( +; CHECK-NEXT: [[CMP:%.*]] = fcmp one double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @llvm.fabs.f64(double %a) nounwind %cmp = fcmp ogt double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test11_intrinsic( -; CHECK-NOT: fabs -; CHECK: fcmp one double %a, 0.000000e+00 } define i32 @test12(double %a) nounwind { +; CHECK-LABEL: @test12( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ord double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @fabs(double %a) nounwind %cmp = fcmp oge double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test12( -; CHECK-NOT: fabs -; CHECK: fcmp ord double %a, 0.000000e+00 } define i32 @test12_intrinsic(double %a) nounwind { +; CHECK-LABEL: @test12_intrinsic( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ord double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @llvm.fabs.f64(double %a) nounwind %cmp = fcmp oge double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test12_intrinsic( -; CHECK-NOT: fabs -; CHECK: fcmp ord double %a, 0.000000e+00 } define i32 @test13(double %a) nounwind { +; 
CHECK-LABEL: @test13( +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @fabs(double %a) nounwind %cmp = fcmp une double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test13( -; CHECK-NOT: fabs -; CHECK: fcmp une double %a, 0.000000e+00 } define i32 @test13_intrinsic(double %a) nounwind { +; CHECK-LABEL: @test13_intrinsic( +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @llvm.fabs.f64(double %a) nounwind %cmp = fcmp une double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test13_intrinsic( -; CHECK-NOT: fabs -; CHECK: fcmp une double %a, 0.000000e+00 } define i32 @test14(double %a) nounwind { +; CHECK-LABEL: @test14( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @fabs(double %a) nounwind %cmp = fcmp oeq double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test14( -; CHECK-NOT: fabs -; CHECK: fcmp oeq double %a, 0.000000e+00 } define i32 @test14_intrinsic(double %a) nounwind { +; CHECK-LABEL: @test14_intrinsic( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @llvm.fabs.f64(double %a) nounwind %cmp = fcmp oeq double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test14_intrinsic( -; CHECK-NOT: fabs -; CHECK: fcmp oeq double %a, 0.000000e+00 } define i32 @test15(double %a) nounwind { +; CHECK-LABEL: @test15( +; CHECK-NEXT: [[CMP:%.*]] = fcmp one double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @fabs(double %a) nounwind %cmp = fcmp one double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test15( -; CHECK-NOT: fabs -; CHECK: fcmp one double %a, 0.000000e+00 } define i32 @test15_intrinsic(double %a) nounwind { +; CHECK-LABEL: @test15_intrinsic( +; CHECK-NEXT: [[CMP:%.*]] = fcmp one double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @llvm.fabs.f64(double %a) nounwind %cmp = fcmp one double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test15_intrinsic( -; CHECK-NOT: fabs -; CHECK: fcmp one double %a, 0.000000e+00 } define i32 @test16(double %a) nounwind { +; CHECK-LABEL: @test16( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ueq double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @fabs(double %a) nounwind %cmp = fcmp ueq double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; CHECK-LABEL: @test16( -; CHECK-NOT: fabs -; CHECK: fcmp ueq double %a, 0.000000e+00 } define i32 @test16_intrinsic(double %a) nounwind { +; CHECK-LABEL: @test16_intrinsic( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ueq double %a, 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double @llvm.fabs.f64(double %a) nounwind %cmp = fcmp ueq double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 ret i32 %conv -; 
CHECK-LABEL: @test16_intrinsic( -; CHECK-NOT: fabs -; CHECK: fcmp ueq double %a, 0.000000e+00 } ; Don't crash. define i32 @test17(double %a, double (double)* %p) nounwind { +; CHECK-LABEL: @test17( +; CHECK-NEXT: [[CALL:%.*]] = tail call double %p(double %a) #1 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ueq double [[CALL]], 0.000000e+00 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; %call = tail call double %p(double %a) nounwind %cmp = fcmp ueq double %call, 0.000000e+00 %conv = zext i1 %cmp to i32 @@ -243,16 +296,18 @@ define i32 @test17(double %a, double (double)* %p) nounwind { ; Can fold fcmp with undef on one side by choosing NaN for the undef define i32 @test18_undef_unordered(float %a) nounwind { -; CHECK-LABEL: @test18_undef_unordered -; CHECK: ret i32 1 +; CHECK-LABEL: @test18_undef_unordered( +; CHECK-NEXT: ret i32 1 +; %cmp = fcmp ueq float %a, undef %conv = zext i1 %cmp to i32 ret i32 %conv } ; Can fold fcmp with undef on one side by choosing NaN for the undef define i32 @test18_undef_ordered(float %a) nounwind { -; CHECK-LABEL: @test18_undef_ordered -; CHECK: ret i32 0 +; CHECK-LABEL: @test18_undef_ordered( +; CHECK-NEXT: ret i32 0 +; %cmp = fcmp oeq float %a, undef %conv = zext i1 %cmp to i32 ret i32 %conv @@ -264,14 +319,18 @@ define i32 @test18_undef_ordered(float %a) nounwind { ; because whatever you choose for the first undef ; you can choose NaN for the other undef define i1 @test19_undef_unordered() nounwind { -; CHECK-LABEL: @test19_undef -; CHECK: ret i1 true +; CHECK-LABEL: @test19_undef_unordered( +; CHECK-NEXT: ret i1 true +; %cmp = fcmp ueq float undef, undef ret i1 %cmp } + define i1 @test19_undef_ordered() nounwind { -; CHECK-LABEL: @test19_undef -; CHECK: ret i1 false +; CHECK-LABEL: @test19_undef_ordered( +; CHECK-NEXT: ret i1 false +; %cmp = fcmp oeq float undef, undef ret i1 %cmp } + diff --git a/test/Transforms/InstCombine/float-shrink-compare.ll b/test/Transforms/InstCombine/float-shrink-compare.ll index a08f9531d217..e0925952bf44 100644 --- a/test/Transforms/InstCombine/float-shrink-compare.ll +++ b/test/Transforms/InstCombine/float-shrink-compare.ll @@ -3,171 +3,329 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.8.0" define i32 @test1(float %x, float %y) nounwind uwtable { - %1 = fpext float %x to double - %2 = call double @ceil(double %1) nounwind readnone - %3 = fpext float %y to double - %4 = fcmp oeq double %2, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %ceil = call double @ceil(double %x.ext) nounwind readnone + %ext.y = fpext float %y to double + %cmp = fcmp oeq double %ceil, %ext.y + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test1( -; CHECK-NEXT: %ceilf = call float @ceilf(float %x) -; CHECK-NEXT: fcmp oeq float %ceilf, %y +; CHECK-NEXT: %ceil = call float @llvm.ceil.f32(float %x) +; CHECK-NEXT: fcmp oeq float %ceil, %y +} + +define i32 @test1_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %ceil = call double @llvm.ceil.f64(double %x.ext) nounwind readnone + %ext.y = fpext float %y to double + %cmp = fcmp oeq double %ceil, %ext.y + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test1_intrin( +; CHECK-NEXT: %ceil = call float @llvm.ceil.f32(float %x) +; CHECK-NEXT: fcmp oeq float %ceil, %y } define i32 @test2(float %x, float %y) nounwind uwtable { - %1 = fpext float %x to double - %2 = call double @fabs(double %1) nounwind 
readnone - %3 = fpext float %y to double - %4 = fcmp oeq double %2, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %fabs = call double @fabs(double %x.ext) nounwind readnone + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %fabs, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test2( -; CHECK-NEXT: %fabsf = call float @fabsf(float %x) -; CHECK-NEXT: fcmp oeq float %fabsf, %y +; CHECK-NEXT: %fabs = call float @llvm.fabs.f32(float %x) +; CHECK-NEXT: fcmp oeq float %fabs, %y } -define i32 @test3(float %x, float %y) nounwind uwtable { +define i32 @test2_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %fabs = call double @llvm.fabs.f64(double %x.ext) nounwind readnone + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %fabs, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test2_intrin( +; CHECK-NEXT: %fabs = call float @llvm.fabs.f32(float %x) +; CHECK-NEXT: fcmp oeq float %fabs, %y +} + +define i32 @fmf_test2(float %x, float %y) nounwind uwtable { %1 = fpext float %x to double - %2 = call double @floor(double %1) nounwind readnone + %2 = call nnan double @fabs(double %1) nounwind readnone %3 = fpext float %y to double %4 = fcmp oeq double %2, %3 %5 = zext i1 %4 to i32 ret i32 %5 +; CHECK-LABEL: @fmf_test2( +; CHECK-NEXT: [[FABS:%[0-9]+]] = call nnan float @llvm.fabs.f32(float %x) +; CHECK-NEXT: fcmp oeq float [[FABS]], %y +} + +define i32 @test3(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %floor = call double @floor(double %x.ext) nounwind readnone + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %floor, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test3( -; CHECK-NEXT: %floorf = call float @floorf(float %x) -; CHECK-NEXT: fcmp oeq float %floorf, %y +; CHECK-NEXT: %floor = call float @llvm.floor.f32(float %x) +; CHECK-NEXT: fcmp oeq float %floor, %y +} + + +define i32 @test3_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %floor = call double @llvm.floor.f64(double %x.ext) nounwind readnone + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %floor, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test3_intrin( +; CHECK-NEXT: %floor = call float @llvm.floor.f32(float %x) +; CHECK-NEXT: fcmp oeq float %floor, %y } define i32 @test4(float %x, float %y) nounwind uwtable { - %1 = fpext float %x to double - %2 = call double @nearbyint(double %1) nounwind - %3 = fpext float %y to double - %4 = fcmp oeq double %2, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %nearbyint = call double @nearbyint(double %x.ext) nounwind + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %nearbyint, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test4( -; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x) -; CHECK-NEXT: fcmp oeq float %nearbyintf, %y +; CHECK-NEXT: %nearbyint = call float @llvm.nearbyint.f32(float %x) +; CHECK-NEXT: fcmp oeq float %nearbyint, %y +} + +define i32 @shrink_nearbyint_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %nearbyint = call double @llvm.nearbyint.f64(double %x.ext) nounwind + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %nearbyint, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @shrink_nearbyint_intrin( +; CHECK-NEXT: %nearbyint = call float 
@llvm.nearbyint.f32(float %x) +; CHECK-NEXT: fcmp oeq float %nearbyint, %y } define i32 @test5(float %x, float %y) nounwind uwtable { - %1 = fpext float %x to double - %2 = call double @rint(double %1) nounwind - %3 = fpext float %y to double - %4 = fcmp oeq double %2, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %rint = call double @rint(double %x.ext) nounwind + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %rint, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test5( -; CHECK-NEXT: %rintf = call float @rintf(float %x) -; CHECK-NEXT: fcmp oeq float %rintf, %y +; CHECK-NEXT: %rint = call float @llvm.rint.f32(float %x) +; CHECK-NEXT: fcmp oeq float %rint, %y } define i32 @test6(float %x, float %y) nounwind uwtable { - %1 = fpext float %x to double - %2 = call double @round(double %1) nounwind readnone - %3 = fpext float %y to double - %4 = fcmp oeq double %2, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %round = call double @round(double %x.ext) nounwind readnone + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %round, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test6( -; CHECK-NEXT: %roundf = call float @roundf(float %x) -; CHECK-NEXT: fcmp oeq float %roundf, %y +; CHECK-NEXT: %round = call float @llvm.round.f32(float %x) +; CHECK-NEXT: fcmp oeq float %round, %y +} + +define i32 @test6_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %round = call double @llvm.round.f64(double %x.ext) nounwind readnone + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %round, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test6_intrin( +; CHECK-NEXT: %round = call float @llvm.round.f32(float %x) +; CHECK-NEXT: fcmp oeq float %round, %y } define i32 @test7(float %x, float %y) nounwind uwtable { - %1 = fpext float %x to double - %2 = call double @trunc(double %1) nounwind - %3 = fpext float %y to double - %4 = fcmp oeq double %2, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %trunc = call double @trunc(double %x.ext) nounwind + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %trunc, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test7( -; CHECK-NEXT: %truncf = call float @truncf(float %x) -; CHECK-NEXT: fcmp oeq float %truncf, %y +; CHECK-NEXT: %trunc = call float @llvm.trunc.f32(float %x) +; CHECK-NEXT: fcmp oeq float %trunc, %y +} + +define i32 @test7_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %trunc = call double @llvm.trunc.f64(double %x.ext) nounwind + %y.ext = fpext float %y to double + %cmp = fcmp oeq double %trunc, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test7_intrin( +; CHECK-NEXT: %trunc = call float @llvm.trunc.f32(float %x) +; CHECK-NEXT: fcmp oeq float %trunc, %y } define i32 @test8(float %x, float %y) nounwind uwtable { - %1 = fpext float %y to double - %2 = fpext float %x to double - %3 = call double @ceil(double %2) nounwind readnone - %4 = fcmp oeq double %1, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %ceil = call double @ceil(double %x.ext) nounwind readnone + %cmp = fcmp oeq double %y.ext, %ceil + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test8( -; CHECK-NEXT: %ceilf = call float @ceilf(float %x) -; CHECK-NEXT: fcmp oeq float 
%ceilf, %y +; CHECK-NEXT: %ceil = call float @llvm.ceil.f32(float %x) +; CHECK-NEXT: fcmp oeq float %ceil, %y +} + +define i32 @test8_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %ceil = call double @llvm.ceil.f64(double %x.ext) nounwind readnone + %cmp = fcmp oeq double %y.ext, %ceil + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test8_intrin( +; CHECK-NEXT: %ceil = call float @llvm.ceil.f32(float %x) +; CHECK-NEXT: fcmp oeq float %ceil, %y } define i32 @test9(float %x, float %y) nounwind uwtable { - %1 = fpext float %y to double - %2 = fpext float %x to double - %3 = call double @fabs(double %2) nounwind readnone - %4 = fcmp oeq double %1, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %fabs = call double @fabs(double %x.ext) nounwind readnone + %cmp = fcmp oeq double %y.ext, %fabs + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test9( -; CHECK-NEXT: %fabsf = call float @fabsf(float %x) -; CHECK-NEXT: fcmp oeq float %fabsf, %y +; CHECK-NEXT: %fabs = call float @llvm.fabs.f32(float %x) +; CHECK-NEXT: fcmp oeq float %fabs, %y +} + +define i32 @test9_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %fabs = call double @llvm.fabs.f64(double %x.ext) nounwind readnone + %cmp = fcmp oeq double %y.ext, %fabs + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test9_intrin( +; CHECK-NEXT: %fabs = call float @llvm.fabs.f32(float %x) +; CHECK-NEXT: fcmp oeq float %fabs, %y } define i32 @test10(float %x, float %y) nounwind uwtable { - %1 = fpext float %y to double - %2 = fpext float %x to double - %3 = call double @floor(double %2) nounwind readnone - %4 = fcmp oeq double %1, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %floor = call double @floor(double %x.ext) nounwind readnone + %cmp = fcmp oeq double %floor, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test10( -; CHECK-NEXT: %floorf = call float @floorf(float %x) -; CHECK-NEXT: fcmp oeq float %floorf, %y +; CHECK-NEXT: %floor = call float @llvm.floor.f32(float %x) +; CHECK-NEXT: fcmp oeq float %floor, %y +} + +define i32 @test10_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %floor = call double @llvm.floor.f64(double %x.ext) nounwind readnone + %cmp = fcmp oeq double %floor, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test10_intrin( +; CHECK-NEXT: %floor = call float @llvm.floor.f32(float %x) +; CHECK-NEXT: fcmp oeq float %floor, %y } define i32 @test11(float %x, float %y) nounwind uwtable { - %1 = fpext float %y to double - %2 = fpext float %x to double - %3 = call double @nearbyint(double %2) nounwind - %4 = fcmp oeq double %1, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %nearbyint = call double @nearbyint(double %x.ext) nounwind + %cmp = fcmp oeq double %nearbyint, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test11( -; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x) -; CHECK-NEXT: fcmp oeq float %nearbyintf, %y +; CHECK-NEXT: %nearbyint = call float @llvm.nearbyint.f32(float %x) +; CHECK-NEXT: fcmp oeq float %nearbyint, %y +} + + +define i32 @test11_intrin(float 
%x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %nearbyint = call double @llvm.nearbyint.f64(double %x.ext) nounwind + %cmp = fcmp oeq double %nearbyint, %y.ext + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test11_intrin( +; CHECK-NEXT: %nearbyint = call float @llvm.nearbyint.f32(float %x) +; CHECK-NEXT: fcmp oeq float %nearbyint, %y } define i32 @test12(float %x, float %y) nounwind uwtable { - %1 = fpext float %y to double - %2 = fpext float %x to double - %3 = call double @rint(double %2) nounwind - %4 = fcmp oeq double %1, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %rint = call double @rint(double %x.ext) nounwind + %cmp = fcmp oeq double %y.ext, %rint + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test12( -; CHECK-NEXT: %rintf = call float @rintf(float %x) -; CHECK-NEXT: fcmp oeq float %rintf, %y +; CHECK-NEXT: %rint = call float @llvm.rint.f32(float %x) +; CHECK-NEXT: fcmp oeq float %rint, %y } define i32 @test13(float %x, float %y) nounwind uwtable { - %1 = fpext float %y to double - %2 = fpext float %x to double - %3 = call double @round(double %2) nounwind readnone - %4 = fcmp oeq double %1, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %round = call double @round(double %x.ext) nounwind readnone + %cmp = fcmp oeq double %y.ext, %round + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test13( -; CHECK-NEXT: %roundf = call float @roundf(float %x) -; CHECK-NEXT: fcmp oeq float %roundf, %y +; CHECK-NEXT: %round = call float @llvm.round.f32(float %x) +; CHECK-NEXT: fcmp oeq float %round, %y +} + +define i32 @test13_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %round = call double @llvm.round.f64(double %x.ext) nounwind readnone + %cmp = fcmp oeq double %y.ext, %round + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test13_intrin( +; CHECK-NEXT: %round = call float @llvm.round.f32(float %x) +; CHECK-NEXT: fcmp oeq float %round, %y } define i32 @test14(float %x, float %y) nounwind uwtable { - %1 = fpext float %y to double - %2 = fpext float %x to double - %3 = call double @trunc(double %2) nounwind - %4 = fcmp oeq double %1, %3 - %5 = zext i1 %4 to i32 - ret i32 %5 + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %trunc = call double @trunc(double %x.ext) nounwind + %cmp = fcmp oeq double %y.ext, %trunc + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext ; CHECK-LABEL: @test14( -; CHECK-NEXT: %truncf = call float @truncf(float %x) -; CHECK-NEXT: fcmp oeq float %truncf, %y +; CHECK-NEXT: %trunc = call float @llvm.trunc.f32(float %x) +; CHECK-NEXT: fcmp oeq float %trunc, %y +} + +define i32 @test14_intrin(float %x, float %y) nounwind uwtable { + %x.ext = fpext float %x to double + %y.ext = fpext float %y to double + %trunc = call double @llvm.trunc.f64(double %x.ext) nounwind + %cmp = fcmp oeq double %y.ext, %trunc + %cmp.ext = zext i1 %cmp to i32 + ret i32 %cmp.ext +; CHECK-LABEL: @test14_intrin( +; CHECK-NEXT: %trunc = call float @llvm.trunc.f32(float %x) +; CHECK-NEXT: fcmp oeq float %trunc, %y } define i32 @test15(float %x, float %y, float %z) nounwind uwtable { @@ -269,3 +427,10 @@ declare double @round(double) nounwind readnone declare double @trunc(double) nounwind readnone declare double @fmin(double, 
double) nounwind readnone declare double @fmax(double, double) nounwind readnone + +declare double @llvm.fabs.f64(double) nounwind readnone +declare double @llvm.ceil.f64(double) nounwind readnone +declare double @llvm.floor.f64(double) nounwind readnone +declare double @llvm.nearbyint.f64(double) nounwind readnone +declare double @llvm.round.f64(double) nounwind readnone +declare double @llvm.trunc.f64(double) nounwind readnone diff --git a/test/Transforms/InstCombine/fma.ll b/test/Transforms/InstCombine/fma.ll index e41f1e7edd46..3808e07d89a0 100644 --- a/test/Transforms/InstCombine/fma.ll +++ b/test/Transforms/InstCombine/fma.ll @@ -78,7 +78,8 @@ define float @fmuladd_fneg_x_fneg_y(float %x, float %y, float %z) { } ; CHECK-LABEL: @fmuladd_fneg_x_fneg_y_fast( -; CHECK: %fmuladd = call fast float @llvm.fmuladd.f32(float %x, float %y, float %z) +; CHECK-NEXT: %1 = fmul fast float %x, %y +; CHECK-NEXT: %fmuladd = fadd fast float %1, %z define float @fmuladd_fneg_x_fneg_y_fast(float %x, float %y, float %z) { %x.fneg = fsub float -0.0, %x %y.fneg = fsub float -0.0, %y @@ -122,7 +123,8 @@ define float @fmuladd_fabs_x_fabs_x(float %x, float %z) { } ; CHECK-LABEL: @fmuladd_fabs_x_fabs_x_fast( -; CHECK: %fmuladd = call fast float @llvm.fmuladd.f32(float %x, float %x, float %z) +; CHECK-NEXT: %1 = fmul fast float %x, %x +; CHECK-NEXT: %fmuladd = fadd fast float %1, %z define float @fmuladd_fabs_x_fabs_x_fast(float %x, float %z) { %x.fabs = call float @llvm.fabs.f32(float %x) %fmuladd = call fast float @llvm.fmuladd.f32(float %x.fabs, float %x.fabs, float %z) @@ -144,7 +146,8 @@ define float @fma_k_y_z_fast(float %y, float %z) { } ; CHECK-LABEL: @fmuladd_k_y_z_fast( -; CHECK: %fmuladd = call fast float @llvm.fmuladd.f32(float %y, float 4.000000e+00, float %z) +; CHECK: %1 = fmul fast float %y, 4.000000e+00 +; CHECK-NEXT: %fmuladd = fadd fast float %1, %z define float @fmuladd_k_y_z_fast(float %y, float %z) { %fmuladd = call fast float @llvm.fmuladd.f32(float 4.0, float %y, float %z) ret float %fmuladd diff --git a/test/Transforms/InstCombine/getelementptr.ll b/test/Transforms/InstCombine/getelementptr.ll index 7ccbdf11fded..de8190da01c2 100644 --- a/test/Transforms/InstCombine/getelementptr.ll +++ b/test/Transforms/InstCombine/getelementptr.ll @@ -883,6 +883,33 @@ define %struct.C* @test46(%struct.C* %c1, %struct.C* %c2, i64 %N) { ; CHECK-NEXT: ret %struct.C* [[GEP]] } +define i32* @test47(i32* %I, i64 %C, i64 %D) { + %sub = sub i64 %D, %C + %A = getelementptr i32, i32* %I, i64 %C + %B = getelementptr i32, i32* %A, i64 %sub + ret i32* %B +; CHECK-LABEL: @test47( +; CHECK-NEXT: %B = getelementptr i32, i32* %I, i64 %D +} + +define i32* @test48(i32* %I, i64 %C, i64 %D) { + %sub = sub i64 %D, %C + %A = getelementptr i32, i32* %I, i64 %sub + %B = getelementptr i32, i32* %A, i64 %C + ret i32* %B +; CHECK-LABEL: @test48( +; CHECK-NEXT: %B = getelementptr i32, i32* %I, i64 %D +} + +define i32* @test49(i32* %I, i64 %C) { + %notC = xor i64 -1, %C + %A = getelementptr i32, i32* %I, i64 %C + %B = getelementptr i32, i32* %A, i64 %notC + ret i32* %B +; CHECK-LABEL: @test49( +; CHECK-NEXT: %B = getelementptr i32, i32* %I, i64 -1 +} + define i32 addrspace(1)* @ascast_0_gep(i32* %p) nounwind { ; CHECK-LABEL: @ascast_0_gep( ; CHECK-NOT: getelementptr @@ -904,4 +931,15 @@ define i32 addrspace(1)* @ascast_0_0_gep([128 x i32]* %p) nounwind { ret i32 addrspace(1)* %x } +define <2 x i32*> @PR32414(i32** %ptr) { +; CHECK-LABEL: @PR32414( +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32** %ptr to i32* +; CHECK-NEXT: [[TMP1:%.*]] 
= getelementptr inbounds i32, i32* [[TMP0]], <2 x i64> <i64 0, i64 1> +; CHECK-NEXT: ret <2 x i32*> [[TMP1]] +; + %tmp0 = bitcast i32** %ptr to i32* + %tmp1 = getelementptr inbounds i32, i32* %tmp0, <2 x i64> <i64 0, i64 1> + ret <2 x i32*> %tmp1 +} + ; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/InstCombine/icmp-add.ll b/test/Transforms/InstCombine/icmp-add.ll new file mode 100644 index 000000000000..efeb9d5bb45b --- /dev/null +++ b/test/Transforms/InstCombine/icmp-add.ll @@ -0,0 +1,247 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +; PR1949 + +define i1 @test1(i32 %a) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 %a, -5 +; CHECK-NEXT: ret i1 [[C]] +; + %b = add i32 %a, 4 + %c = icmp ult i32 %b, 4 + ret i1 %c +} + +define <2 x i1> @test1vec(<2 x i32> %a) { +; CHECK-LABEL: @test1vec( +; CHECK-NEXT: [[C:%.*]] = icmp ugt <2 x i32> %a, <i32 -5, i32 -5> +; CHECK-NEXT: ret <2 x i1> [[C]] +; + %b = add <2 x i32> %a, <i32 4, i32 4> + %c = icmp ult <2 x i32> %b, <i32 4, i32 4> + ret <2 x i1> %c +} + +define i1 @test2(i32 %a) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 %a, 4 +; CHECK-NEXT: ret i1 [[C]] +; + %b = sub i32 %a, 4 + %c = icmp ugt i32 %b, -5 + ret i1 %c +} + +define <2 x i1> @test2vec(<2 x i32> %a) { +; CHECK-LABEL: @test2vec( +; CHECK-NEXT: [[C:%.*]] = icmp ult <2 x i32> %a, <i32 4, i32 4> +; CHECK-NEXT: ret <2 x i1> [[C]] +; + %b = sub <2 x i32> %a, <i32 4, i32 4> + %c = icmp ugt <2 x i32> %b, <i32 -5, i32 -5> + ret <2 x i1> %c +} + +define i1 @test3(i32 %a) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 %a, 2147483643 +; CHECK-NEXT: ret i1 [[C]] +; + %b = add i32 %a, 4 + %c = icmp slt i32 %b, 2147483652 + ret i1 %c +} + +define <2 x i1> @test3vec(<2 x i32> %a) { +; CHECK-LABEL: @test3vec( +; CHECK-NEXT: [[C:%.*]] = icmp sgt <2 x i32> %a, <i32 2147483643, i32 2147483643> +; CHECK-NEXT: ret <2 x i1> [[C]] +; + %b = add <2 x i32> %a, <i32 4, i32 4> + %c = icmp slt <2 x i32> %b, <i32 2147483652, i32 2147483652> + ret <2 x i1> %c +} + +define i1 @test4(i32 %a) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[C:%.*]] = icmp slt i32 %a, -4 +; CHECK-NEXT: ret i1 [[C]] +; + %b = add i32 %a, 2147483652 + %c = icmp sge i32 %b, 4 + ret i1 %c +} + +define <2 x i1> @test4vec(<2 x i32> %a) { +; CHECK-LABEL: @test4vec( +; CHECK-NEXT: [[C:%.*]] = icmp slt <2 x i32> %a, <i32 -4, i32 -4> +; CHECK-NEXT: ret <2 x i1> [[C]] +; + %b = add <2 x i32> %a, <i32 2147483652, i32 2147483652> + %c = icmp sge <2 x i32> %b, <i32 4, i32 4> + ret <2 x i1> %c +} + +; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow. +; This becomes equality because it's at the limit. + +define i1 @nsw_slt1(i8 %a) { +; CHECK-LABEL: @nsw_slt1( +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 %a, -128 +; CHECK-NEXT: ret i1 [[C]] +; + %b = add nsw i8 %a, 100 + %c = icmp slt i8 %b, -27 + ret i1 %c +} + +; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow. +; This becomes equality because it's at the limit. + +define i1 @nsw_slt2(i8 %a) { +; CHECK-LABEL: @nsw_slt2( +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 %a, 127 +; CHECK-NEXT: ret i1 [[C]] +; + %b = add nsw i8 %a, -100 + %c = icmp slt i8 %b, 27 + ret i1 %c +} + +; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow. +; Less than the limit, so the predicate doesn't change. 
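+; A worked instance (a sketch mirroring nsw_slt3 below, assuming i8): with
+; C2 = 100 and C = -26, C - C2 = -126 still fits in i8, so the add drops out:
+;   %b = add nsw i8 %a, 100
+;   %c = icmp slt i8 %b, -26
+; -->
+;   %c = icmp slt i8 %a, -126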
+ +define i1 @nsw_slt3(i8 %a) { +; CHECK-LABEL: @nsw_slt3( +; CHECK-NEXT: [[C:%.*]] = icmp slt i8 %a, -126 +; CHECK-NEXT: ret i1 [[C]] +; + %b = add nsw i8 %a, 100 + %c = icmp slt i8 %b, -26 + ret i1 %c +} + +; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow. +; Less than the limit, so the predicate doesn't change. + +define i1 @nsw_slt4(i8 %a) { +; CHECK-LABEL: @nsw_slt4( +; CHECK-NEXT: [[C:%.*]] = icmp slt i8 %a, 126 +; CHECK-NEXT: ret i1 [[C]] +; + %b = add nsw i8 %a, -100 + %c = icmp slt i8 %b, 26 + ret i1 %c +} + +; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow. +; Try sgt to make sure that works too. + +define i1 @nsw_sgt1(i8 %a) { +; CHECK-LABEL: @nsw_sgt1( +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 %a, 127 +; CHECK-NEXT: ret i1 [[C]] +; + %b = add nsw i8 %a, -100 + %c = icmp sgt i8 %b, 26 + ret i1 %c +} + +; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow. +; Try a vector type to make sure that works too. +; FIXME: This should be 'eq 127' as above. + +define <2 x i1> @nsw_sgt2_splat_vec(<2 x i8> %a) { +; CHECK-LABEL: @nsw_sgt2_splat_vec( +; CHECK-NEXT: [[C:%.*]] = icmp sgt <2 x i8> %a, <i8 -126, i8 -126> +; CHECK-NEXT: ret <2 x i1> [[C]] +; + %b = add nsw <2 x i8> %a, <i8 100, i8 100> + %c = icmp sgt <2 x i8> %b, <i8 -26, i8 -26> + ret <2 x i1> %c +} + +; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow. +; Comparison with 0 doesn't need special-casing. + +define i1 @slt_zero_add_nsw(i32 %a) { +; CHECK-LABEL: @slt_zero_add_nsw( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 %a, -1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %a, 1 + %cmp = icmp slt i32 %add, 0 + ret i1 %cmp +} + +; The same fold should work with vectors. + +define <2 x i1> @slt_zero_add_nsw_splat_vec(<2 x i8> %a) { +; CHECK-LABEL: @slt_zero_add_nsw_splat_vec( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> %a, <i8 -1, i8 -1> +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %add = add nsw <2 x i8> %a, <i8 1, i8 1> + %cmp = icmp slt <2 x i8> %add, zeroinitializer + ret <2 x i1> %cmp +} + +; Test the edges - instcombine should not interfere with simplification to constants. +; Constant subtraction does not overflow, but this is false. + +define i1 @nsw_slt3_ov_no(i8 %a) { +; CHECK-LABEL: @nsw_slt3_ov_no( +; CHECK-NEXT: ret i1 false +; + %b = add nsw i8 %a, 100 + %c = icmp slt i8 %b, -28 + ret i1 %c +} + +; Test the edges - instcombine should not interfere with simplification to constants. +; Constant subtraction overflows. This is false. + +define i1 @nsw_slt4_ov(i8 %a) { +; CHECK-LABEL: @nsw_slt4_ov( +; CHECK-NEXT: ret i1 false +; + %b = add nsw i8 %a, 100 + %c = icmp slt i8 %b, -29 + ret i1 %c +} + +; Test the edges - instcombine should not interfere with simplification to constants. +; Constant subtraction overflows. This is true. + +define i1 @nsw_slt5_ov(i8 %a) { +; CHECK-LABEL: @nsw_slt5_ov( +; CHECK-NEXT: ret i1 true +; + %b = add nsw i8 %a, -100 + %c = icmp slt i8 %b, 28 + ret i1 %c +} + +; InstCombine should not thwart this opportunity to simplify completely. + +define i1 @slt_zero_add_nsw_signbit(i8 %x) { +; CHECK-LABEL: @slt_zero_add_nsw_signbit( +; CHECK-NEXT: ret i1 true +; + %y = add nsw i8 %x, -128 + %z = icmp slt i8 %y, 0 + ret i1 %z +} + +; InstCombine should not thwart this opportunity to simplify completely. 
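+; Sketch of the reasoning for both signbit tests: in the nsw case above,
+; %x + (-128) avoids signed wrap only when %x >= 0, which forces the sum
+; into [-128, -1], so the slt-zero compare is always true. In the nuw case
+; below, %x + 128 avoids unsigned wrap only when %x <= 127, forcing the sum
+; into [128, 255], which is always negative as a signed i8.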
+ +define i1 @slt_zero_add_nuw_signbit(i8 %x) { +; CHECK-LABEL: @slt_zero_add_nuw_signbit( +; CHECK-NEXT: ret i1 true +; + %y = add nuw i8 %x, 128 + %z = icmp slt i8 %y, 0 + ret i1 %z +} + diff --git a/test/Transforms/InstCombine/icmp-shl-nsw.ll b/test/Transforms/InstCombine/icmp-shl-nsw.ll index 896a45625b9f..ba05302897e9 100644 --- a/test/Transforms/InstCombine/icmp-shl-nsw.ll +++ b/test/Transforms/InstCombine/icmp-shl-nsw.ll @@ -73,8 +73,7 @@ define <2 x i1> @icmp_shl_nsw_eq_vec(<2 x i32> %x) { define i1 @icmp_sgt1(i8 %x) { ; CHECK-LABEL: @icmp_sgt1( -; CHECK-NEXT: [[SHL_MASK:%.*]] = and i8 %x, 127 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[SHL_MASK]], 64 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 %x, -64 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -84,8 +83,7 @@ define i1 @icmp_sgt1(i8 %x) { define i1 @icmp_sgt2(i8 %x) { ; CHECK-LABEL: @icmp_sgt2( -; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -127 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 %x, -64 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -95,8 +93,7 @@ define i1 @icmp_sgt2(i8 %x) { define i1 @icmp_sgt3(i8 %x) { ; CHECK-LABEL: @icmp_sgt3( -; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -16 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 %x, -8 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -106,8 +103,7 @@ define i1 @icmp_sgt3(i8 %x) { define i1 @icmp_sgt4(i8 %x) { ; CHECK-LABEL: @icmp_sgt4( -; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -2 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 %x, -1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -120,8 +116,7 @@ define i1 @icmp_sgt4(i8 %x) { define i1 @icmp_sgt5(i8 %x) { ; CHECK-LABEL: @icmp_sgt5( -; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 %x, 0 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -131,8 +126,7 @@ define i1 @icmp_sgt5(i8 %x) { define i1 @icmp_sgt6(i8 %x) { ; CHECK-LABEL: @icmp_sgt6( -; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 %x, 8 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -142,8 +136,7 @@ define i1 @icmp_sgt6(i8 %x) { define i1 @icmp_sgt7(i8 %x) { ; CHECK-LABEL: @icmp_sgt7( -; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], 124 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 %x, 62 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -155,8 +148,7 @@ define i1 @icmp_sgt7(i8 %x) { define i1 @icmp_sgt8(i8 %x) { ; CHECK-LABEL: @icmp_sgt8( -; CHECK-NEXT: [[SHL_MASK:%.*]] = and i8 %x, 127 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[SHL_MASK]], 63 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 %x, 63 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 1 @@ -170,8 +162,7 @@ define i1 @icmp_sgt8(i8 %x) { define i1 @icmp_sgt9(i8 %x) { ; CHECK-LABEL: @icmp_sgt9( -; CHECK-NEXT: [[SHL_MASK:%.*]] = and i8 %x, 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[SHL_MASK]], 0 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 %x, -1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 7 @@ -181,8 +172,7 @@ define i1 @icmp_sgt9(i8 %x) { define i1 @icmp_sgt10(i8 %x) { ; CHECK-LABEL: @icmp_sgt10( -; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 7 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -127 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 %x, -1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 
%x, 7 @@ -192,8 +182,7 @@ define i1 @icmp_sgt10(i8 %x) { define i1 @icmp_sgt11(i8 %x) { ; CHECK-LABEL: @icmp_sgt11( -; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 7 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -2 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 %x, -1 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i8 %x, 7 @@ -205,8 +194,7 @@ define i1 @icmp_sgt11(i8 %x) { define <2 x i1> @icmp_sgt11_vec(<2 x i8> %x) { ; CHECK-LABEL: @icmp_sgt11_vec( -; CHECK-NEXT: [[SHL:%.*]] = shl nsw <2 x i8> %x, <i8 7, i8 7> -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i8> [[SHL]], <i8 -2, i8 -2> +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i8> %x, <i8 -1, i8 -1> ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %shl = shl nsw <2 x i8> %x, <i8 7, i8 7> @@ -216,3 +204,153 @@ define <2 x i1> @icmp_sgt11_vec(<2 x i8> %x) { ; Known bits analysis returns false for compares with >=0. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Repeat the shl nsw + sgt tests with predicate changed to 'sle'. +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Known bits analysis turns this into an equality predicate. + +define i1 @icmp_sle1(i8 %x) { +; CHECK-LABEL: @icmp_sle1( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 %x, -64 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sle i8 %shl, -128 + ret i1 %cmp +} + +define i1 @icmp_sle2(i8 %x) { +; CHECK-LABEL: @icmp_sle2( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 %x, -63 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sle i8 %shl, -127 + ret i1 %cmp +} + +define i1 @icmp_sle3(i8 %x) { +; CHECK-LABEL: @icmp_sle3( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 %x, -7 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sle i8 %shl, -16 + ret i1 %cmp +} + +define i1 @icmp_sle4(i8 %x) { +; CHECK-LABEL: @icmp_sle4( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 %x, 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sle i8 %shl, -2 + ret i1 %cmp +} + +; x <=s -1 is a sign bit test. +; x <=s 0 is a sign bit test. + +define i1 @icmp_sle5(i8 %x) { +; CHECK-LABEL: @icmp_sle5( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 %x, 1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sle i8 %shl, 1 + ret i1 %cmp +} + +define i1 @icmp_sle6(i8 %x) { +; CHECK-LABEL: @icmp_sle6( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 %x, 9 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sle i8 %shl, 16 + ret i1 %cmp +} + +define i1 @icmp_sle7(i8 %x) { +; CHECK-LABEL: @icmp_sle7( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 %x, 63 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sle i8 %shl, 124 + ret i1 %cmp +} + +; Known bits analysis turns this into an equality predicate. + +define i1 @icmp_sle8(i8 %x) { +; CHECK-LABEL: @icmp_sle8( +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 %x, 63 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sle i8 %shl, 125 + ret i1 %cmp +} + +; Compares with 126 and 127 are recognized as always true. + +; Known bits analysis turns this into an equality predicate. 
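+; Sketch of why: "shl nsw i8 %x, 7" multiplies by 128 without signed
+; overflow, which only holds for %x = 0 (giving 0) and %x = -1 (giving
+; -128), so the "sle -128" compare below is just the equality %x == -1.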
+ +define i1 @icmp_sle9(i8 %x) { +; CHECK-LABEL: @icmp_sle9( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 %x, -1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 7 + %cmp = icmp sle i8 %shl, -128 + ret i1 %cmp +} + +define i1 @icmp_sle10(i8 %x) { +; CHECK-LABEL: @icmp_sle10( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 %x, 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 7 + %cmp = icmp sle i8 %shl, -127 + ret i1 %cmp +} + +define i1 @icmp_sle11(i8 %x) { +; CHECK-LABEL: @icmp_sle11( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 %x, 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 7 + %cmp = icmp sle i8 %shl, -2 + ret i1 %cmp +} + +; Some of the earlier sgt/sle tests are transformed to eq/ne, but try a couple +; of those explicitly, so we know no intermediate transforms are necessary. + +define i1 @icmp_eq1(i8 %x) { +; CHECK-LABEL: @icmp_eq1( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 %x, 6 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp eq i8 %shl, 12 + ret i1 %cmp +} + +define i1 @icmp_ne1(i8 %x) { +; CHECK-LABEL: @icmp_ne1( +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 %x, -2 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 6 + %cmp = icmp ne i8 %shl, -128 + ret i1 %cmp +} + diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll index 32fe050bf83f..edfa9a102917 100644 --- a/test/Transforms/InstCombine/icmp.ll +++ b/test/Transforms/InstCombine/icmp.ll @@ -918,7 +918,7 @@ define i1 @test60_as1(i8 addrspace(1)* %foo, i64 %i, i64 %j) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 %i to i16 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 %j to i16 ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i16 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i16 [[TMP2]], [[GEP1_IDX]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP2]] ; CHECK-NEXT: ret i1 [[TMP3]] ; %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)* @@ -949,7 +949,7 @@ define i1 @test60_addrspacecast_smaller(i8* %foo, i16 %i, i64 %j) { ; CHECK-LABEL: @test60_addrspacecast_smaller( ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i16 %i, 2 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 %j to i16 -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i16 [[TMP1]], [[GEP1_IDX]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP1]] ; CHECK-NEXT: ret i1 [[TMP2]] ; %bit = addrspacecast i8* %foo to i32 addrspace(1)* @@ -981,7 +981,7 @@ define i1 @test61(i8* %foo, i64 %i, i64 %j) { ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[BIT]], i64 %i ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, i8* %foo, i64 %j ; CHECK-NEXT: [[CAST1:%.*]] = bitcast i32* [[GEP1]] to i8* -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8* [[CAST1]], [[GEP2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8* [[GEP2]], [[CAST1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %bit = bitcast i8* %foo to i32* @@ -999,7 +999,7 @@ define i1 @test61_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) { ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32 addrspace(1)* [[BIT]], i16 %i ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, i8 addrspace(1)* %foo, i16 %j ; CHECK-NEXT: [[CAST1:%.*]] = bitcast i32 addrspace(1)* [[GEP1]] to i8 addrspace(1)* -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 addrspace(1)* [[CAST1]], [[GEP2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 addrspace(1)* [[GEP2]], [[CAST1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)* @@ -1123,19 +1123,6 @@ define i1 @test68(i32 %x) { ret i1 %cmp } -; PR14708 -define i1 @test69(i32 %c) { -; CHECK-LABEL: @test69( -; CHECK-NEXT: [[TMP1:%.*]] = or i32 %c, 32 -; CHECK-NEXT: 
[[TMP2:%.*]] = icmp eq i32 [[TMP1]], 97 -; CHECK-NEXT: ret i1 [[TMP2]] -; - %1 = icmp eq i32 %c, 97 - %2 = icmp eq i32 %c, 65 - %3 = or i1 %1, %2 - ret i1 %3 -} - ; PR15940 define i1 @test70(i32 %X) { ; CHECK-LABEL: @test70( @@ -1183,12 +1170,11 @@ define i1 @icmp_sext8trunc(i32 %x) { ret i1 %cmp } -; FIXME: Vectors should fold the same way. +; Vectors should fold the same way. define <2 x i1> @icmp_sext8trunc_vec(<2 x i32> %x) { ; CHECK-LABEL: @icmp_sext8trunc_vec( -; CHECK-NEXT: [[SEXT1:%.*]] = shl <2 x i32> %x, <i32 24, i32 24> -; CHECK-NEXT: [[SEXT:%.*]] = ashr <2 x i32> [[SEXT:%.*]]1, <i32 24, i32 24> -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[SEXT]], <i32 36, i32 36> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> %x to <2 x i8> +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], <i8 36, i8 36> ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %trunc = trunc <2 x i32> %x to <2 x i8> @@ -1877,6 +1863,55 @@ define <2 x i1> @icmp_and_X_-16_ne-16_vec(<2 x i32> %X) { ret <2 x i1> %cmp } +; PR32524: https://bugs.llvm.org/show_bug.cgi?id=32524 +; X | C == C --> X <=u C (when C+1 is PowerOf2). + +define i1 @or1_eq1(i32 %x) { +; CHECK-LABEL: @or1_eq1( +; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 %x, 2 +; CHECK-NEXT: ret i1 [[T1]] +; + %t0 = or i32 %x, 1 + %t1 = icmp eq i32 %t0, 1 + ret i1 %t1 +} + +; X | C == C --> X <=u C (when C+1 is PowerOf2). + +define <2 x i1> @or3_eq3_vec(<2 x i8> %x) { +; CHECK-LABEL: @or3_eq3_vec( +; CHECK-NEXT: [[T1:%.*]] = icmp ult <2 x i8> %x, <i8 4, i8 4> +; CHECK-NEXT: ret <2 x i1> [[T1]] +; + %t0 = or <2 x i8> %x, <i8 3, i8 3> + %t1 = icmp eq <2 x i8> %t0, <i8 3, i8 3> + ret <2 x i1> %t1 +} + +; X | C != C --> X >u C (when C+1 is PowerOf2). + +define i1 @or7_ne7(i32 %x) { +; CHECK-LABEL: @or7_ne7( +; CHECK-NEXT: [[T1:%.*]] = icmp ugt i32 %x, 7 +; CHECK-NEXT: ret i1 [[T1]] +; + %t0 = or i32 %x, 7 + %t1 = icmp ne i32 %t0, 7 + ret i1 %t1 +} + +; X | C != C --> X >u C (when C+1 is PowerOf2). + +define <2 x i1> @or63_ne63_vec(<2 x i8> %x) { +; CHECK-LABEL: @or63_ne63_vec( +; CHECK-NEXT: [[T1:%.*]] = icmp ugt <2 x i8> %x, <i8 63, i8 63> +; CHECK-NEXT: ret <2 x i1> [[T1]] +; + %t0 = or <2 x i8> %x, <i8 63, i8 63> + %t1 = icmp ne <2 x i8> %t0, <i8 63, i8 63> + ret <2 x i1> %t1 +} + define i1 @shrink_constant(i32 %X) { ; CHECK-LABEL: @shrink_constant( ; CHECK-NEXT: [[XOR:%.*]] = xor i32 %X, -12 @@ -2232,16 +2267,6 @@ define i1 @icmp_sge_zero_add_nsw(i32 %a) { ret i1 %cmp } -define i1 @icmp_slt_zero_add_nsw(i32 %a) { -; CHECK-LABEL: @icmp_slt_zero_add_nsw( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 %a, -1 -; CHECK-NEXT: ret i1 [[CMP]] -; - %add = add nsw i32 %a, 1 - %cmp = icmp slt i32 %add, 0 - ret i1 %cmp -} - define i1 @icmp_sle_zero_add_nsw(i32 %a) { ; CHECK-LABEL: @icmp_sle_zero_add_nsw( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 %a, 0 @@ -2425,6 +2450,10 @@ define i1 @f10(i16 %p) { ret i1 %cmp580 } +; Note: fptosi is used in various tests below to ensure that operand complexity +; canonicalization does not kick in, which would make some of the tests +; equivalent to one another. 
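+; For instance (a sketch of the canonicalization, not one of the tests):
+; with a bare argument on the LHS,
+;   %inc = add nsw i32 %y, 1
+;   %cmp = icmp sle i32 %x, %inc
+; InstCombine would swap the simpler operand to the RHS, producing
+;   %cmp = icmp sge i32 %inc, %x
+; and the rhs_inc test would collapse into its lhs_inc sibling; routing one
+; side through fptosi keeps an instruction there so no swap happens.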
+ define i1 @cmp_sgt_rhs_dec(float %x, i32 %i) { ; CHECK-LABEL: @cmp_sgt_rhs_dec( ; CHECK-NEXT: [[CONV:%.*]] = fptosi float %x to i32 @@ -2711,3 +2740,143 @@ define i1 @or_ptrtoint_mismatch(i8* %p, i32* %q) { %b = icmp eq i64 %o, 0 ret i1 %b } + +define i1 @icmp_add1_ugt(i32 %x, i32 %y) { +; CHECK-LABEL: @icmp_add1_ugt( +; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 %x, %y +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nuw i32 %x, 1 + %cmp = icmp ugt i32 %add, %y + ret i1 %cmp +} + +define i1 @icmp_add1_ule(i32 %x, i32 %y) { +; CHECK-LABEL: @icmp_add1_ule( +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 %x, %y +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nuw i32 %x, 1 + %cmp = icmp ule i32 %add, %y + ret i1 %cmp +} + +define i1 @cmp_uge_rhs_inc(float %x, i32 %i) { +; CHECK-LABEL: @cmp_uge_rhs_inc( +; CHECK-NEXT: [[CONV:%.*]] = fptosi float %x to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CONV]], %i +; CHECK-NEXT: ret i1 [[CMP]] +; + %conv = fptosi float %x to i32 + %inc = add nuw i32 %i, 1 + %cmp = icmp uge i32 %conv, %inc + ret i1 %cmp +} + +define i1 @cmp_ult_rhs_inc(float %x, i32 %i) { +; CHECK-LABEL: @cmp_ult_rhs_inc( +; CHECK-NEXT: [[CONV:%.*]] = fptosi float %x to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp ule i32 [[CONV]], %i +; CHECK-NEXT: ret i1 [[CMP]] +; + %conv = fptosi float %x to i32 + %inc = add nuw i32 %i, 1 + %cmp = icmp ult i32 %conv, %inc + ret i1 %cmp +} + +define i1 @cmp_sge_lhs_inc(i32 %x, i32 %y) { +; CHECK-LABEL: @cmp_sge_lhs_inc( +; CHECK-NEXT: [[INC:%.*]] = add +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[INC]], %y +; CHECK-NEXT: ret i1 [[CMP]] +; + %inc = add nsw i32 %x, 1 + %cmp = icmp sge i32 %inc, %y + ret i1 %cmp +} + +define i1 @cmp_uge_lhs_inc(i32 %x, i32 %y) { +; CHECK-LABEL: @cmp_uge_lhs_inc( +; CHECK-NEXT: [[INC:%.*]] = add +; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[INC]], %y +; CHECK-NEXT: ret i1 [[CMP]] +; + %inc = add nuw i32 %x, 1 + %cmp = icmp uge i32 %inc, %y + ret i1 %cmp +} + +define i1 @cmp_sgt_lhs_dec(i32 %x, i32 %y) { +; CHECK-LABEL: @cmp_sgt_lhs_dec( +; CHECK-NEXT: [[DEC:%.*]] = {{add|sub}} +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[DEC]], %y +; CHECK-NEXT: ret i1 [[CMP]] +; + %dec = sub nsw i32 %x, 1 + %cmp = icmp sgt i32 %dec, %y + ret i1 %cmp +} + +define i1 @cmp_ugt_lhs_dec(i32 %x, i32 %y) { +; CHECK-LABEL: @cmp_ugt_lhs_dec( +; CHECK-NEXT: [[DEC:%.*]] = {{add|sub}} +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[DEC]], %y +; CHECK-NEXT: ret i1 [[CMP]] +; + %dec = sub nuw i32 %x, 1 + %cmp = icmp ugt i32 %dec, %y + ret i1 %cmp +} + +define i1 @cmp_sle_rhs_inc(float %x, i32 %y) { +; CHECK-LABEL: @cmp_sle_rhs_inc( +; CHECK-NEXT: [[CONV:%.*]] = fptosi float %x to i32 +; CHECK-NEXT: [[INC:%.*]] = add +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[INC]], [[CONV]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %conv = fptosi float %x to i32 + %inc = add nsw i32 %y, 1 + %cmp = icmp sle i32 %conv, %inc + ret i1 %cmp +} + +define i1 @cmp_ule_rhs_inc(float %x, i32 %y) { +; CHECK-LABEL: @cmp_ule_rhs_inc( +; CHECK-NEXT: [[CONV:%.*]] = fptosi float %x to i32 +; CHECK-NEXT: [[INC:%.*]] = add +; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[INC]], [[CONV]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %conv = fptosi float %x to i32 + %inc = add nuw i32 %y, 1 + %cmp = icmp ule i32 %conv, %inc + ret i1 %cmp +} + +define i1 @cmp_slt_rhs_dec(float %x, i32 %y) { +; CHECK-LABEL: @cmp_slt_rhs_dec( +; CHECK-NEXT: [[CONV:%.*]] = fptosi float %x to i32 +; CHECK-NEXT: [[DEC:%.*]] = {{add|sub}} +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[DEC]], [[CONV]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %conv = fptosi float %x 
to i32 + %dec = sub nsw i32 %y, 1 + %cmp = icmp slt i32 %conv, %dec + ret i1 %cmp +} + +define i1 @cmp_ult_rhs_dec(float %x, i32 %y) { +; CHECK-LABEL: @cmp_ult_rhs_dec( +; CHECK-NEXT: [[CONV:%.*]] = fptosi float %x to i32 +; CHECK-NEXT: [[DEC:%.*]] = {{add|sub}} +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[DEC]], [[CONV]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %conv = fptosi float %x to i32 + %dec = sub nuw i32 %y, 1 + %cmp = icmp ult i32 %conv, %dec + ret i1 %cmp +} diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll index 4507deb7f023..29f774c5f62b 100644 --- a/test/Transforms/InstCombine/insert-extract-shuffle.ll +++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll @@ -86,11 +86,8 @@ define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) { define <8 x i16> @pr26015(<4 x i16> %t0) { ; CHECK-LABEL: @pr26015( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> %t0, i32 2 -; CHECK-NEXT: [[T2:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 [[TMP2]], i32 3 -; CHECK-NEXT: [[T3:%.*]] = insertelement <8 x i16> [[T2]], i16 0, i32 6 -; CHECK-NEXT: [[T5:%.*]] = shufflevector <8 x i16> [[T3]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[T5:%.*]] = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 10, i32 4, i32 5, i32 6, i32 11> ; CHECK-NEXT: ret <8 x i16> [[T5]] ; %t1 = extractelement <4 x i16> %t0, i32 2 @@ -110,8 +107,7 @@ define <8 x i16> @pr25999(<4 x i16> %t0, i1 %b) { ; CHECK-NEXT: br i1 %b, label %if, label %end ; CHECK: if: ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> -; CHECK-NEXT: [[T2:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 [[T1]], i32 3 -; CHECK-NEXT: [[T3:%.*]] = insertelement <8 x i16> [[T2]], i16 0, i32 6 +; CHECK-NEXT: [[T3:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, i16 [[T1]], i32 3 ; CHECK-NEXT: [[T5:%.*]] = shufflevector <8 x i16> [[T3]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11> ; CHECK-NEXT: ret <8 x i16> [[T5]] ; CHECK: end: diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll index 858f9c029b30..e8f5ddd329ff 100644 --- a/test/Transforms/InstCombine/intrinsics.ll +++ b/test/Transforms/InstCombine/intrinsics.ll @@ -351,33 +351,12 @@ define void @ctpop_cmp_vec(<2 x i32> %a, <2 x i1>* %b) { ; CHECK-NEXT: store volatile <2 x i1> %pop1.cmp, <2 x i1>* %b } -define i32 @cttz_simplify1a(i32 %x) nounwind readnone ssp { - %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false) - %shr3 = lshr i32 %tmp1, 5 - ret i32 %shr3 - -; CHECK-LABEL: @cttz_simplify1a( -; CHECK: icmp eq i32 %x, 0 -; CHECK-NEXT: zext i1 -; CHECK-NEXT: ret i32 -} - -define i32 @cttz_simplify1b(i32 %x) nounwind readnone ssp { - %tmp1 = tail call i32 
@llvm.ctlz.i32(i32 %x, i1 true) - %shr3 = lshr i32 %tmp1, 5 - ret i32 %shr3 - -; CHECK-LABEL: @cttz_simplify1b( -; CHECK-NEXT: ret i32 0 -} - -define i32 @ctlz_undef(i32 %Value) nounwind { +define i32 @ctlz_undef(i32 %Value) { ; CHECK-LABEL: @ctlz_undef( ; CHECK-NEXT: ret i32 undef ; %ctlz = call i32 @llvm.ctlz.i32(i32 0, i1 true) ret i32 %ctlz - } define i32 @ctlz_make_undef(i32 %a) { diff --git a/test/Transforms/InstCombine/lifetime-asan.ll b/test/Transforms/InstCombine/lifetime-asan.ll index f52c0202b773..7fdc1fcbc3b3 100644 --- a/test/Transforms/InstCombine/lifetime-asan.ll +++ b/test/Transforms/InstCombine/lifetime-asan.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s -declare void @llvm.lifetime.start(i64, i8* nocapture) -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) declare void @foo(i8* nocapture) define void @asan() sanitize_address { @@ -9,8 +9,8 @@ entry: ; CHECK-LABEL: @asan( %text = alloca i8, align 1 - call void @llvm.lifetime.start(i64 1, i8* %text) - call void @llvm.lifetime.end(i64 1, i8* %text) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %text) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %text) ; CHECK: call void @llvm.lifetime.start ; CHECK-NEXT: call void @llvm.lifetime.end @@ -25,8 +25,8 @@ entry: ; CHECK-LABEL: @no_asan( %text = alloca i8, align 1 - call void @llvm.lifetime.start(i64 1, i8* %text) - call void @llvm.lifetime.end(i64 1, i8* %text) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %text) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %text) ; CHECK-NO: call void @llvm.lifetime call void @foo(i8* %text) ; Keep alloca alive diff --git a/test/Transforms/InstCombine/lifetime.ll b/test/Transforms/InstCombine/lifetime.ll index c296d29b99b9..71c676233b08 100644 --- a/test/Transforms/InstCombine/lifetime.ll +++ b/test/Transforms/InstCombine/lifetime.ll @@ -1,8 +1,8 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s declare void @llvm.dbg.declare(metadata, metadata, metadata) -declare void @llvm.lifetime.start(i64, i8* nocapture) -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) declare void @foo(i8* nocapture, i8* nocapture) define void @bar(i1 %flag) !dbg !4 { @@ -17,11 +17,11 @@ entry: ; CHECK: bb3: ; CHECK-NEXT: call void @llvm.dbg.declare ; CHECK-NEXT: br label %fin -; CHECK: call void @llvm.lifetime.start(i64 1, i8* %[[T]]) -; CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* %[[B]]) +; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %[[T]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* %[[B]]) ; CHECK-NEXT: call void @foo(i8* %[[B]], i8* %[[T]]) -; CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* %[[B]]) -; CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* %[[T]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* %[[B]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* %[[T]]) %text = alloca [1 x i8], align 1 %buff = alloca [1 x i8], align 1 %0 = getelementptr inbounds [1 x i8], [1 x i8]* %text, i64 0, i64 0 @@ -29,31 +29,31 @@ entry: br i1 %flag, label %if, label %else if: - call void @llvm.lifetime.start(i64 1, i8* %0) - call void @llvm.lifetime.start(i64 1, i8* %1) - call void @llvm.lifetime.end(i64 1, i8* %1) - call void @llvm.lifetime.end(i64 1, i8* %0) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %0) + call void 
@llvm.lifetime.start.p0i8(i64 1, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %0) br label %bb2 bb2: - call void @llvm.lifetime.start(i64 1, i8* %0) - call void @llvm.lifetime.start(i64 1, i8* %1) - call void @llvm.lifetime.end(i64 1, i8* %0) - call void @llvm.lifetime.end(i64 1, i8* %1) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %0) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %0) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %1) br label %bb3 bb3: - call void @llvm.lifetime.start(i64 1, i8* %0) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %0) call void @llvm.dbg.declare(metadata [1 x i8]* %text, metadata !14, metadata !25), !dbg !26 - call void @llvm.lifetime.end(i64 1, i8* %0) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %0) br label %fin else: - call void @llvm.lifetime.start(i64 1, i8* %0) - call void @llvm.lifetime.start(i64 1, i8* %1) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %0) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %1) call void @foo(i8* %1, i8* %0) - call void @llvm.lifetime.end(i64 1, i8* %1) - call void @llvm.lifetime.end(i64 1, i8* %0) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %0) br label %fin fin: diff --git a/test/Transforms/InstCombine/load-cmp.ll b/test/Transforms/InstCombine/load-cmp.ll index 75952e01c19c..5746b7aa28d5 100644 --- a/test/Transforms/InstCombine/load-cmp.ll +++ b/test/Transforms/InstCombine/load-cmp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -instcombine -S -default-data-layout="p:32:32:32-p1:16:16:16-n8:16:32:64" < %s | FileCheck %s +; RUN: opt -instcombine -S -data-layout="p:32:32:32-p1:16:16:16-n8:16:32:64" < %s | FileCheck %s @G16 = internal constant [10 x i16] [i16 35, i16 82, i16 69, i16 81, i16 85, i16 73, i16 82, i16 69, i16 68, i16 0] diff --git a/test/Transforms/InstCombine/lshr.ll b/test/Transforms/InstCombine/lshr.ll new file mode 100644 index 000000000000..b81371b03042 --- /dev/null +++ b/test/Transforms/InstCombine/lshr.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -instcombine -S < %s | FileCheck %s + +declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone +declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1) nounwind readnone +declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone +declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>) nounwind readnone + +define i32 @lshr_ctlz_zero_is_not_undef(i32 %x) { +; CHECK-LABEL: @lshr_ctlz_zero_is_not_undef( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 %x, 0 +; CHECK-NEXT: [[SH:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SH]] +; + %ct = call i32 @llvm.ctlz.i32(i32 %x, i1 false) + %sh = lshr i32 %ct, 5 + ret i32 %sh +} + +define i32 @lshr_cttz_zero_is_not_undef(i32 %x) { +; CHECK-LABEL: @lshr_cttz_zero_is_not_undef( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 %x, 0 +; CHECK-NEXT: [[SH:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SH]] +; + %ct = call i32 @llvm.cttz.i32(i32 %x, i1 false) + %sh = lshr i32 %ct, 5 + ret i32 %sh +} + +define i32 @lshr_ctpop(i32 %x) { +; CHECK-LABEL: @lshr_ctpop( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 %x, -1 +; CHECK-NEXT: [[SH:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SH]] +; + %ct = call i32 @llvm.ctpop.i32(i32 %x) + 
%sh = lshr i32 %ct, 5 + ret i32 %sh +} + +define <2 x i8> @lshr_ctlz_zero_is_not_undef_splat_vec(<2 x i8> %x) { +; CHECK-LABEL: @lshr_ctlz_zero_is_not_undef_splat_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer +; CHECK-NEXT: [[SH:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[SH]] +; + %ct = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %x, i1 false) + %sh = lshr <2 x i8> %ct, <i8 3, i8 3> + ret <2 x i8> %sh +} + +define <2 x i8> @lshr_cttz_zero_is_not_undef_splat_vec(<2 x i8> %x) { +; CHECK-LABEL: @lshr_cttz_zero_is_not_undef_splat_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer +; CHECK-NEXT: [[SH:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[SH]] +; + %ct = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %x, i1 false) + %sh = lshr <2 x i8> %ct, <i8 3, i8 3> + ret <2 x i8> %sh +} + +define <2 x i8> @lshr_ctpop_splat_vec(<2 x i8> %x) { +; CHECK-LABEL: @lshr_ctpop_splat_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, <i8 -1, i8 -1> +; CHECK-NEXT: [[SH:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[SH]] +; + %ct = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %x) + %sh = lshr <2 x i8> %ct, <i8 3, i8 3> + ret <2 x i8> %sh +} + +define i8 @lshr_exact(i8 %x) { +; CHECK-LABEL: @lshr_exact( +; CHECK-NEXT: [[SHL:%.*]] = shl i8 %x, 2 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SHL]], 4 +; CHECK-NEXT: [[LSHR:%.*]] = lshr exact i8 [[ADD]], 2 +; CHECK-NEXT: ret i8 [[LSHR]] +; + %shl = shl i8 %x, 2 + %add = add i8 %shl, 4 + %lshr = lshr i8 %add, 2 + ret i8 %lshr +} + +define <2 x i8> @lshr_exact_splat_vec(<2 x i8> %x) { +; CHECK-LABEL: @lshr_exact_splat_vec( +; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i8> %x, <i8 2, i8 2> +; CHECK-NEXT: [[ADD:%.*]] = add <2 x i8> [[SHL]], <i8 4, i8 4> +; CHECK-NEXT: [[LSHR:%.*]] = lshr exact <2 x i8> [[ADD]], <i8 2, i8 2> +; CHECK-NEXT: ret <2 x i8> [[LSHR]] +; + %shl = shl <2 x i8> %x, <i8 2, i8 2> + %add = add <2 x i8> %shl, <i8 4, i8 4> + %lshr = lshr <2 x i8> %add, <i8 2, i8 2> + ret <2 x i8> %lshr +} + diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll index 8fcb8214360d..7a5c7457e364 100644 --- a/test/Transforms/InstCombine/malloc-free-delete.ll +++ b/test/Transforms/InstCombine/malloc-free-delete.ll @@ -24,8 +24,8 @@ define i1 @foo() { ret i1 %z } -declare void @llvm.lifetime.start(i64, i8*) -declare void @llvm.lifetime.end(i64, i8*) +declare void @llvm.lifetime.start.p0i8(i64, i8*) +declare void @llvm.lifetime.end.p0i8(i64, i8*) declare i64 @llvm.objectsize.i64(i8*, i1) declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind @@ -35,8 +35,8 @@ define void @test3(i8* %src) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: ret void %a = call noalias i8* @malloc(i32 10) - call void @llvm.lifetime.start(i64 10, i8* %a) - call void @llvm.lifetime.end(i64 10, i8* %a) + call void @llvm.lifetime.start.p0i8(i64 10, i8* %a) + call void @llvm.lifetime.end.p0i8(i64 10, i8* %a) %size = call i64 @llvm.objectsize.i64(i8* %a, i1 true) store i8 42, i8* %a call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %src, i32 32, i32 1, i1 false) diff --git a/test/Transforms/InstCombine/max-of-nots.ll b/test/Transforms/InstCombine/max-of-nots.ll index 96fac5228970..519f1c6a90b0 100644 --- a/test/Transforms/InstCombine/max-of-nots.ll +++ b/test/Transforms/InstCombine/max-of-nots.ll @@ -90,6 +90,28 @@ define i32 
@max_of_nots(i32 %x, i32 %y) {
   ret i32 %smax96
 }
 
+ ; Negative test case (i.e., cannot simplify): ABS(MIN(NOT x, y)).
+define i32 @abs_of_min_of_not(i32 %x, i32 %y) {
+; CHECK-LABEL: @abs_of_min_of_not(
+; CHECK-NEXT: xor
+; CHECK-NEXT: add
+; CHECK-NEXT: icmp sge
+; CHECK-NEXT: select
+; CHECK-NEXT: icmp sgt
+; CHECK-NEXT: sub
+; CHECK-NEXT: select
+; CHECK-NEXT: ret
+
+  %xord = xor i32 %x, -1
+  %yadd = add i32 %y, 2
+  %cond.i = icmp sge i32 %yadd, %xord
+  %min = select i1 %cond.i, i32 %xord, i32 %yadd
+  %cmp2 = icmp sgt i32 %min, -1
+  %sub = sub i32 0, %min
+  %abs = select i1 %cmp2, i32 %min, i32 %sub
+  ret i32 %abs
+}
+
 define <2 x i32> @max_of_nots_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @max_of_nots_vec(
 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i32> %y, zeroinitializer
diff --git a/test/Transforms/InstCombine/memcmp-1.ll b/test/Transforms/InstCombine/memcmp-1.ll
index f9ff479e3add..96516f44e081 100644
--- a/test/Transforms/InstCombine/memcmp-1.ll
+++ b/test/Transforms/InstCombine/memcmp-1.ll
@@ -14,67 +14,76 @@ declare i32 @memcmp(i8*, i8*, i32)
 
 define i32 @test_simplify1(i8* %mem, i32 %size) {
 ; CHECK-LABEL: @test_simplify1(
+; CHECK-NEXT: ret i32 0
+;
   %ret = call i32 @memcmp(i8* %mem, i8* %mem, i32 %size)
   ret i32 %ret
-; CHECK: ret i32 0
 }
 
 ; Check memcmp(mem1, mem2, 0) -> 0.
 
 define i32 @test_simplify2(i8* %mem1, i8* %mem2) {
 ; CHECK-LABEL: @test_simplify2(
+; CHECK-NEXT: ret i32 0
+;
   %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 0)
   ret i32 %ret
-; CHECK: ret i32 0
 }
 
 ;; Check memcmp(mem1, mem2, 1) -> *(unsigned char*)mem1 - *(unsigned char*)mem2.
 
 define i32 @test_simplify3(i8* %mem1, i8* %mem2) {
 ; CHECK-LABEL: @test_simplify3(
+; CHECK-NEXT: [[LHSC:%.*]] = load i8, i8* %mem1, align 1
+; CHECK-NEXT: [[LHSV:%.*]] = zext i8 [[LHSC]] to i32
+; CHECK-NEXT: [[RHSC:%.*]] = load i8, i8* %mem2, align 1
+; CHECK-NEXT: [[RHSV:%.*]] = zext i8 [[RHSC]] to i32
+; CHECK-NEXT: [[CHARDIFF:%.*]] = sub nsw i32 [[LHSV]], [[RHSV]]
+; CHECK-NEXT: ret i32 [[CHARDIFF]]
+;
   %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 1)
-; CHECK: [[LOAD1:%[a-z]+]] = load i8, i8* %mem1, align 1
-; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
-; CHECK: [[LOAD2:%[a-z]+]] = load i8, i8* %mem2, align 1
-; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
-; CHECK: [[RET:%[a-z]+]] = sub nsw i32 [[ZEXT1]], [[ZEXT2]]
   ret i32 %ret
-; CHECK: ret i32 [[RET]]
 }
 
 ; Check memcmp(mem1, mem2, size) -> cnst, where all arguments are constants.
 
define i32 @test_simplify4() { ; CHECK-LABEL: @test_simplify4( +; CHECK-NEXT: ret i32 0 +; %mem1 = getelementptr [4 x i8], [4 x i8]* @hel, i32 0, i32 0 %mem2 = getelementptr [8 x i8], [8 x i8]* @hello_u, i32 0, i32 0 %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3) ret i32 %ret -; CHECK: ret i32 0 } define i32 @test_simplify5() { ; CHECK-LABEL: @test_simplify5( +; CHECK-NEXT: ret i32 1 +; %mem1 = getelementptr [4 x i8], [4 x i8]* @hel, i32 0, i32 0 %mem2 = getelementptr [4 x i8], [4 x i8]* @foo, i32 0, i32 0 %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3) ret i32 %ret -; CHECK: ret i32 1 } define i32 @test_simplify6() { ; CHECK-LABEL: @test_simplify6( +; CHECK-NEXT: ret i32 -1 +; %mem1 = getelementptr [4 x i8], [4 x i8]* @foo, i32 0, i32 0 %mem2 = getelementptr [4 x i8], [4 x i8]* @hel, i32 0, i32 0 %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3) ret i32 %ret -; CHECK: ret i32 -1 } ; Check memcmp(mem1, mem2, 8)==0 -> *(int64_t*)mem1 == *(int64_t*)mem2 define i1 @test_simplify7(i64 %x, i64 %y) { ; CHECK-LABEL: @test_simplify7( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 %x, %y +; CHECK-NEXT: ret i1 [[CMP]] +; %x.addr = alloca i64, align 8 %y.addr = alloca i64, align 8 store i64 %x, i64* %x.addr, align 8 @@ -84,14 +93,15 @@ define i1 @test_simplify7(i64 %x, i64 %y) { %call = call i32 @memcmp(i8* %xptr, i8* %yptr, i32 8) %cmp = icmp eq i32 %call, 0 ret i1 %cmp -; CHECK: %cmp = icmp eq i64 %x, %y -; CHECK: ret i1 %cmp } ; Check memcmp(mem1, mem2, 4)==0 -> *(int32_t*)mem1 == *(int32_t*)mem2 define i1 @test_simplify8(i32 %x, i32 %y) { ; CHECK-LABEL: @test_simplify8( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 %x, %y +; CHECK-NEXT: ret i1 [[CMP]] +; %x.addr = alloca i32, align 4 %y.addr = alloca i32, align 4 store i32 %x, i32* %x.addr, align 4 @@ -101,14 +111,15 @@ define i1 @test_simplify8(i32 %x, i32 %y) { %call = call i32 @memcmp(i8* %xptr, i8* %yptr, i32 4) %cmp = icmp eq i32 %call, 0 ret i1 %cmp -; CHECK: %cmp = icmp eq i32 %x, %y -; CHECK: ret i1 %cmp } ; Check memcmp(mem1, mem2, 2)==0 -> *(int16_t*)mem1 == *(int16_t*)mem2 define i1 @test_simplify9(i16 %x, i16 %y) { ; CHECK-LABEL: @test_simplify9( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 %x, %y +; CHECK-NEXT: ret i1 [[CMP]] +; %x.addr = alloca i16, align 2 %y.addr = alloca i16, align 2 store i16 %x, i16* %x.addr, align 2 @@ -118,6 +129,4 @@ define i1 @test_simplify9(i16 %x, i16 %y) { %call = call i32 @memcmp(i8* %xptr, i8* %yptr, i32 2) %cmp = icmp eq i32 %call, 0 ret i1 %cmp -; CHECK: %cmp = icmp eq i16 %x, %y -; CHECK: ret i1 %cmp } diff --git a/test/Transforms/InstCombine/memcpy-addrspace.ll b/test/Transforms/InstCombine/memcpy-addrspace.ll new file mode 100644 index 000000000000..17bc1d08f986 --- /dev/null +++ b/test/Transforms/InstCombine/memcpy-addrspace.ll @@ -0,0 +1,85 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +@test.data = private unnamed_addr addrspace(2) constant [8 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7], align 4 + +; CHECK-LABEL: test_load +; CHECK: %[[GEP:.*]] = getelementptr [8 x i32], [8 x i32] addrspace(2)* @test.data, i64 0, i64 %x +; CHECK: %{{.*}} = load i32, i32 addrspace(2)* %[[GEP]] +; CHECK-NOT: alloca +; CHECK-NOT: call void @llvm.memcpy.p0i8.p2i8.i64 +; CHECK-NOT: addrspacecast +; CHECK-NOT: load i32, i32* +define void @test_load(i32 addrspace(1)* %out, i64 %x) { +entry: + %data = alloca [8 x i32], align 4 + %0 = bitcast [8 x i32]* %data to i8* + call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), 
i64 32, i32 4, i1 false) + %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %data, i64 0, i64 %x + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x + store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK-LABEL: test_load_bitcast_chain +; CHECK: %[[GEP:.*]] = getelementptr [8 x i32], [8 x i32] addrspace(2)* @test.data, i64 0, i64 %x +; CHECK: %{{.*}} = load i32, i32 addrspace(2)* %[[GEP]] +; CHECK-NOT: alloca +; CHECK-NOT: call void @llvm.memcpy.p0i8.p2i8.i64 +; CHECK-NOT: addrspacecast +; CHECK-NOT: load i32, i32* +define void @test_load_bitcast_chain(i32 addrspace(1)* %out, i64 %x) { +entry: + %data = alloca [8 x i32], align 4 + %0 = bitcast [8 x i32]* %data to i8* + call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false) + %1 = bitcast i8* %0 to i32* + %arrayidx = getelementptr inbounds i32, i32* %1, i64 %x + %2 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x + store i32 %2, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK-LABEL: test_call +; CHECK: alloca +; CHECK: call void @llvm.memcpy.p0i8.p2i8.i64 +; CHECK-NOT: addrspacecast +; CHECK: call i32 @foo(i32* %{{.*}}) +define void @test_call(i32 addrspace(1)* %out, i64 %x) { +entry: + %data = alloca [8 x i32], align 4 + %0 = bitcast [8 x i32]* %data to i8* + call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false) + %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %data, i64 0, i64 %x + %1 = call i32 @foo(i32* %arrayidx) + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x + store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK-LABEL: test_load_and_call +; CHECK: alloca +; CHECK: call void @llvm.memcpy.p0i8.p2i8.i64 +; CHECK: load i32, i32* %{{.*}} +; CHECK: call i32 @foo(i32* %{{.*}}) +; CHECK-NOT: addrspacecast +; CHECK-NOT: load i32, i32 addrspace(2)* +define void @test_load_and_call(i32 addrspace(1)* %out, i64 %x, i64 %y) { +entry: + %data = alloca [8 x i32], align 4 + %0 = bitcast [8 x i32]* %data to i8* + call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false) + %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %data, i64 0, i64 %x + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x + store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 + %2 = call i32 @foo(i32* %arrayidx) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %y + store i32 %2, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + + +declare void @llvm.memcpy.p0i8.p2i8.i64(i8* nocapture writeonly, i8 addrspace(2)* nocapture readonly, i64, i32, i1) +declare i32 @foo(i32* %x) diff --git a/test/Transforms/InstCombine/memcpy-from-global.ll b/test/Transforms/InstCombine/memcpy-from-global.ll index da38087d7397..7c9384d89ba3 100644 --- a/test/Transforms/InstCombine/memcpy-from-global.ll +++ b/test/Transforms/InstCombine/memcpy-from-global.ll @@ -126,11 +126,11 @@ define void @test4() { ret void } -declare void @llvm.lifetime.start(i64, i8*) +declare void @llvm.lifetime.start.p0i8(i64, i8*) define void @test5() { %A = alloca %T %a = bitcast %T* %A to i8* - call void 
@llvm.lifetime.start(i64 -1, i8* %a)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %a)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
   call void @baz(i8* byval %a)
 ; CHECK-LABEL: @test5(
diff --git a/test/Transforms/InstCombine/memcpy-to-load.ll b/test/Transforms/InstCombine/memcpy-to-load.ll
index bcc9e188b965..fe5f0ac657f1 100644
--- a/test/Transforms/InstCombine/memcpy-to-load.ll
+++ b/test/Transforms/InstCombine/memcpy-to-load.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep "load double"
+; RUN: opt < %s -instcombine -S | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"
 
@@ -10,4 +10,8 @@ entry:
   ret void
 }
 
+; Make sure that the memcpy has been replaced with a load/store of i64
+; CHECK: [[TMP:%[0-9]+]] = load i64
+; CHECK: store i64 [[TMP]]
+
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/Transforms/InstCombine/memset_chk-1.ll b/test/Transforms/InstCombine/memset_chk-1.ll
index 9d08e96cb49b..79028502b641 100644
--- a/test/Transforms/InstCombine/memset_chk-1.ll
+++ b/test/Transforms/InstCombine/memset_chk-1.ll
@@ -69,7 +69,7 @@ define i32 @test_rauw(i8* %a, i8* %b, i8** %c) {
 entry:
   %call49 = call i64 @strlen(i8* %a)
   %add180 = add i64 %call49, 1
-  %yo107 = call i64 @llvm.objectsize.i64.p0i8(i8* %b, i1 false)
+  %yo107 = call i64 @llvm.objectsize.i64.p0i8(i8* %b, i1 false, i1 false)
   %call50 = call i8* @__memmove_chk(i8* %b, i8* %a, i64 %add180, i64 %yo107)
 ; CHECK: %strlen = call i64 @strlen(i8* %b)
 ; CHECK-NEXT: %strchr2 = getelementptr i8, i8* %b, i64 %strlen
@@ -87,7 +87,7 @@ entry:
 declare i8* @__memmove_chk(i8*, i8*, i64, i64)
 declare i8* @strrchr(i8*, i32)
 declare i64 @strlen(i8* nocapture)
-declare i64 @llvm.objectsize.i64.p0i8(i8*, i1)
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1)
 
 declare i8* @__memset_chk(i8*, i32, i64, i64)
 
@@ -100,7 +100,7 @@ entry:
   br i1 %cmp, label %cleanup, label %if.end
 if.end:
   %bc = bitcast i8* %call to float*
-  %call2 = tail call i64 @llvm.objectsize.i64.p0i8(i8* nonnull %call, i1 false)
+  %call2 = tail call i64 @llvm.objectsize.i64.p0i8(i8* nonnull %call, i1 false, i1 false)
   %call3 = tail call i8* @__memset_chk(i8* nonnull %call, i32 0, i64 %size, i64 %call2) #1
   br label %cleanup
 cleanup:
@@ -114,7 +114,7 @@ cleanup:
 ; CHECK-NEXT: br i1 %cmp, label %cleanup, label %if.end
 ; CHECK: if.end:
 ; CHECK-NEXT: %bc = bitcast i8* %call to float*
-; CHECK-NEXT: %call2 = tail call i64 @llvm.objectsize.i64.p0i8(i8* nonnull %call, i1 false)
+; CHECK-NEXT: %call2 = tail call i64 @llvm.objectsize.i64.p0i8(i8* nonnull %call, i1 false, i1 false)
 ; CHECK-NEXT: %call3 = tail call i8* @__memset_chk(i8* nonnull %call, i32 0, i64 %size, i64 %call2)
 ; CHECK-NEXT: br label %cleanup
 ; CHECK: cleanup:
diff --git a/test/Transforms/InstCombine/minmax-fold.ll b/test/Transforms/InstCombine/minmax-fold.ll
index bf46cefd8ab3..19a7341fdc28 100644
--- a/test/Transforms/InstCombine/minmax-fold.ll
+++ b/test/Transforms/InstCombine/minmax-fold.ll
@@ -339,14 +339,85 @@ define i32 @test75(i32 %x) {
   ret i32 %retval
 }
 
+; The next 4 tests are value clamping with constants:
+; https://llvm.org/bugs/show_bug.cgi?id=31693
+
+; (X <s C1) ? 
C1 : SMIN(X, C2) ==> SMAX(SMIN(X, C2), C1) + +define i32 @clamp_signed1(i32 %x) { +; CHECK-LABEL: @clamp_signed1( +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 %x, 255 +; CHECK-NEXT: [[MIN:%.*]] = select i1 [[CMP2]], i32 %x, i32 255 +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[MIN]], 15 +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP1]], i32 [[MIN]], i32 15 +; CHECK-NEXT: ret i32 [[R]] +; + %cmp2 = icmp slt i32 %x, 255 + %min = select i1 %cmp2, i32 %x, i32 255 + %cmp1 = icmp slt i32 %x, 15 + %r = select i1 %cmp1, i32 15, i32 %min + ret i32 %r +} + +; (X >s C1) ? C1 : SMAX(X, C2) ==> SMIN(SMAX(X, C2), C1) + +define i32 @clamp_signed2(i32 %x) { +; CHECK-LABEL: @clamp_signed2( +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 %x, 15 +; CHECK-NEXT: [[MAX:%.*]] = select i1 [[CMP2]], i32 %x, i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[MAX]], 255 +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP1]], i32 [[MAX]], i32 255 +; CHECK-NEXT: ret i32 [[R]] +; + %cmp2 = icmp sgt i32 %x, 15 + %max = select i1 %cmp2, i32 %x, i32 15 + %cmp1 = icmp sgt i32 %x, 255 + %r = select i1 %cmp1, i32 255, i32 %max + ret i32 %r +} + +; (X <u C1) ? C1 : UMIN(X, C2) ==> UMAX(UMIN(X, C2), C1) + +define i32 @clamp_unsigned1(i32 %x) { +; CHECK-LABEL: @clamp_unsigned1( +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 %x, 255 +; CHECK-NEXT: [[MIN:%.*]] = select i1 [[CMP2]], i32 %x, i32 255 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[MIN]], 15 +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP1]], i32 [[MIN]], i32 15 +; CHECK-NEXT: ret i32 [[R]] +; + %cmp2 = icmp ult i32 %x, 255 + %min = select i1 %cmp2, i32 %x, i32 255 + %cmp1 = icmp ult i32 %x, 15 + %r = select i1 %cmp1, i32 15, i32 %min + ret i32 %r +} + +; (X >u C1) ? C1 : UMAX(X, C2) ==> UMIN(UMAX(X, C2), C1) + +define i32 @clamp_unsigned2(i32 %x) { +; CHECK-LABEL: @clamp_unsigned2( +; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 %x, 15 +; CHECK-NEXT: [[MAX:%.*]] = select i1 [[CMP2]], i32 %x, i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[MAX]], 255 +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP1]], i32 [[MAX]], i32 255 +; CHECK-NEXT: ret i32 [[R]] +; + %cmp2 = icmp ugt i32 %x, 15 + %max = select i1 %cmp2, i32 %x, i32 15 + %cmp1 = icmp ugt i32 %x, 255 + %r = select i1 %cmp1, i32 255, i32 %max + ret i32 %r +} + ; The next 3 min tests should canonicalize to the same form...and not infinite loop. 
define double @PR31751_umin1(i32 %x) { ; CHECK-LABEL: @PR31751_umin1( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 %x, 2147483647 -; CHECK-NEXT: [[CONV1:%.*]] = select i1 [[TMP1]], i32 %x, i32 2147483647 -; CHECK-NEXT: [[TMP2:%.*]] = sitofp i32 [[CONV1]] to double -; CHECK-NEXT: ret double [[TMP2]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[TMP1]], i32 %x, i32 2147483647 +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[SEL]] to double +; CHECK-NEXT: ret double [[CONV]] ; %cmp = icmp slt i32 %x, 0 %sel = select i1 %cmp, i32 2147483647, i32 %x @@ -385,9 +456,9 @@ define double @PR31751_umin3(i32 %x) { define double @PR31751_umax1(i32 %x) { ; CHECK-LABEL: @PR31751_umax1( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 %x, -2147483648 -; CHECK-NEXT: [[CONV1:%.*]] = select i1 [[TMP1]], i32 %x, i32 -2147483648 -; CHECK-NEXT: [[TMP2:%.*]] = sitofp i32 [[CONV1]] to double -; CHECK-NEXT: ret double [[TMP2]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[TMP1]], i32 %x, i32 -2147483648 +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[SEL]] to double +; CHECK-NEXT: ret double [[CONV]] ; %cmp = icmp sgt i32 %x, -1 %sel = select i1 %cmp, i32 2147483648, i32 %x @@ -420,3 +491,77 @@ define double @PR31751_umax3(i32 %x) { %conv = sitofp i32 %sel to double ret double %conv } + +; The icmp/select form a canonical smax, so don't hide that by folding the final bitcast into the select. + +define float @bitcast_scalar_smax(float %x, float %y) { +; CHECK-LABEL: @bitcast_scalar_smax( +; CHECK-NEXT: [[BCX:%.*]] = bitcast float %x to i32 +; CHECK-NEXT: [[BCY:%.*]] = bitcast float %y to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[BCX]], [[BCY]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[BCX]], i32 [[BCY]] +; CHECK-NEXT: [[BCS:%.*]] = bitcast i32 [[SEL]] to float +; CHECK-NEXT: ret float [[BCS]] +; + %bcx = bitcast float %x to i32 + %bcy = bitcast float %y to i32 + %cmp = icmp sgt i32 %bcx, %bcy + %sel = select i1 %cmp, i32 %bcx, i32 %bcy + %bcs = bitcast i32 %sel to float + ret float %bcs +} + +; FIXME: Create a canonical umax by bitcasting the select. + +define float @bitcast_scalar_umax(float %x, float %y) { +; CHECK-LABEL: @bitcast_scalar_umax( +; CHECK-NEXT: [[BCX:%.*]] = bitcast float %x to i32 +; CHECK-NEXT: [[BCY:%.*]] = bitcast float %y to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[BCX]], [[BCY]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], float %x, float %y +; CHECK-NEXT: ret float [[SEL]] +; + %bcx = bitcast float %x to i32 + %bcy = bitcast float %y to i32 + %cmp = icmp ugt i32 %bcx, %bcy + %sel = select i1 %cmp, float %x, float %y + ret float %sel +} + +; PR32306 - https://bugs.llvm.org/show_bug.cgi?id=32306 +; The icmp/select form a canonical smin, so don't hide that by folding the final bitcast into the select. 
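+; For contrast, a rough sketch of the form that folding the bitcast into the
+; select would produce for this test:
+;   %cmp = icmp slt <8 x i32> %bcx, %bcy
+;   %sel = select <8 x i1> %cmp, <8 x float> %x, <8 x float> %y
+; in which the integer smin is no longer visible to later min/max folds.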
+ +define <8 x float> @bitcast_vector_smin(<8 x float> %x, <8 x float> %y) { +; CHECK-LABEL: @bitcast_vector_smin( +; CHECK-NEXT: [[BCX:%.*]] = bitcast <8 x float> %x to <8 x i32> +; CHECK-NEXT: [[BCY:%.*]] = bitcast <8 x float> %y to <8 x i32> +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <8 x i32> [[BCX]], [[BCY]] +; CHECK-NEXT: [[SEL:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[BCX]], <8 x i32> [[BCY]] +; CHECK-NEXT: [[BCS:%.*]] = bitcast <8 x i32> [[SEL]] to <8 x float> +; CHECK-NEXT: ret <8 x float> [[BCS]] +; + %bcx = bitcast <8 x float> %x to <8 x i32> + %bcy = bitcast <8 x float> %y to <8 x i32> + %cmp = icmp slt <8 x i32> %bcx, %bcy + %sel = select <8 x i1> %cmp, <8 x i32> %bcx, <8 x i32> %bcy + %bcs = bitcast <8 x i32> %sel to <8 x float> + ret <8 x float> %bcs +} + +; FIXME: Create a canonical umin by bitcasting the select. + +define <8 x float> @bitcast_vector_umin(<8 x float> %x, <8 x float> %y) { +; CHECK-LABEL: @bitcast_vector_umin( +; CHECK-NEXT: [[BCX:%.*]] = bitcast <8 x float> %x to <8 x i32> +; CHECK-NEXT: [[BCY:%.*]] = bitcast <8 x float> %y to <8 x i32> +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <8 x i32> [[BCX]], [[BCY]] +; CHECK-NEXT: [[SEL:%.*]] = select <8 x i1> [[CMP]], <8 x float> %x, <8 x float> %y +; CHECK-NEXT: ret <8 x float> [[SEL]] +; + %bcx = bitcast <8 x float> %x to <8 x i32> + %bcy = bitcast <8 x float> %y to <8 x i32> + %cmp = icmp slt <8 x i32> %bcx, %bcy + %sel = select <8 x i1> %cmp, <8 x float> %x, <8 x float> %y + ret <8 x float> %sel +} + diff --git a/test/Transforms/InstCombine/narrow-switch.ll b/test/Transforms/InstCombine/narrow-switch.ll index c391fd2cd332..474bd820c8f8 100644 --- a/test/Transforms/InstCombine/narrow-switch.ll +++ b/test/Transforms/InstCombine/narrow-switch.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; Vary legal integer types in data layout. -; RUN: opt < %s -instcombine -S -default-data-layout=n32 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32 -; RUN: opt < %s -instcombine -S -default-data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64 +; RUN: opt < %s -instcombine -S -data-layout=n32 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32 +; RUN: opt < %s -instcombine -S -data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64 ; In all cases, the data-layout is irrelevant. We should shrink as much as possible in InstCombine ; and allow the backend to expand as much as needed to ensure optimal codegen for any target. 
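 ; A sketch of the shrinking in question (illustrative only; the narrow type is
 ; chosen from the range of the case values, not from the data layout):
 ;   switch i64 %x, label %def [ i64 1, label %a ]
 ; can become
 ;   %t = trunc i64 %x to i32
 ;   switch i32 %t, label %def [ i32 1, label %a ]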
@@ -164,3 +164,45 @@ case124:
   ret i8 5
 }
 
+; Make sure the arithmetic of the switch condition is
+; evaluated on the original type.
+define i32 @trunc32to16(i32 %a0) #0 {
+; ALL-LABEL: @trunc32to16(
+; ALL: switch i16
+; ALL-NEXT: i16 63, label %sw.bb
+; ALL-NEXT: i16 1, label %sw.bb1
+; ALL-NEXT: i16 100, label %sw.bb2
+; ALL-NEXT: ]
+;
+entry:
+  %retval = alloca i32, align 4
+  %xor = xor i32 %a0, 1034460917
+  %shr = lshr i32 %xor, 16
+  %add = add i32 %shr, -917677090
+  switch i32 %add, label %sw.epilog [
+    i32 -917677027, label %sw.bb
+    i32 -917677089, label %sw.bb1
+    i32 -917676990, label %sw.bb2
+  ]
+
+sw.bb:                                            ; preds = %entry
+  store i32 90, i32* %retval, align 4
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+  store i32 91, i32* %retval, align 4
+  br label %return
+
+sw.bb2:                                           ; preds = %entry
+  store i32 92, i32* %retval, align 4
+  br label %return
+
+sw.epilog:                                        ; preds = %entry
+  store i32 113, i32* %retval, align 4
+  br label %return
+
+return:                                           ; preds = %sw.epilog, %sw.bb2,
+  %rval = load i32, i32* %retval, align 4
+  ret i32 %rval
+}
+
diff --git a/test/Transforms/InstCombine/narrow.ll b/test/Transforms/InstCombine/narrow.ll
index 0e000e8bdbeb..1df400aac973 100644
--- a/test/Transforms/InstCombine/narrow.ll
+++ b/test/Transforms/InstCombine/narrow.ll
@@ -97,3 +97,143 @@ define <2 x i32> @shrink_and_vec(<2 x i33> %a) {
   ret <2 x i32> %trunc
 }
 
+; FIXME:
+; This is based on an 'any_of' loop construct.
+; By narrowing the phi and logic op, we simplify away the zext and the final icmp.
+
+define i1 @searchArray1(i32 %needle, i32* %haystack) {
+; CHECK-LABEL: @searchArray1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[FOUND:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[OR:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDVAR]] to i64
+; CHECK-NEXT: [[IDX:%.*]] = getelementptr i32, i32* [[HAYSTACK:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[IDX]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[LD]], [[NEEDLE:%.*]]
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP1]] to i8
+; CHECK-NEXT: [[OR]] = or i8 [[FOUND]], [[ZEXT]]
+; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR_NEXT]], 1000
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i8 [[OR]], 0
+; CHECK-NEXT: ret i1 [[TOBOOL]]
+;
+entry:
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %loop ]
+  %found = phi i8 [ 0, %entry ], [ %or, %loop ]
+  %idx = getelementptr i32, i32* %haystack, i32 %indvar
+  %ld = load i32, i32* %idx
+  %cmp1 = icmp eq i32 %ld, %needle
+  %zext = zext i1 %cmp1 to i8
+  %or = or i8 %found, %zext
+  %indvar.next = add i32 %indvar, 1
+  %exitcond = icmp eq i32 %indvar.next, 1000
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %tobool = icmp ne i8 %or, 0
+  ret i1 %tobool
+}
+
+; FIXME:
+; This is based on an 'all_of' loop construct.
+; By narrowing the phi and logic op, we simplify away the zext and the final icmp.
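+; A sketch of the narrowed form this loop would ideally reach (assuming the
+; narrowing fold lands; the tests do not check for it yet):
+;   %found = phi i1 [ true, %entry ], [ %and, %loop ]
+;   %and = and i1 %found, %cmp1
+; with the exit block returning the i1 recurrence directly, so the zext and
+; the trailing icmp disappear.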
+ +define i1 @searchArray2(i32 %hay, i32* %haystack) { +; CHECK-LABEL: @searchArray2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[FOUND:%.*]] = phi i8 [ 1, [[ENTRY]] ], [ [[AND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IDX:%.*]] = getelementptr i32, i32* [[HAYSTACK:%.*]], i64 [[INDVAR]] +; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[IDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[LD]], [[HAY:%.*]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP1]] to i8 +; CHECK-NEXT: [[AND]] = and i8 [[FOUND]], [[ZEXT]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i8 [[AND]], 0 +; CHECK-NEXT: ret i1 [[TOBOOL]] +; +entry: + br label %loop + +loop: + %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ] + %found = phi i8 [ 1, %entry ], [ %and, %loop ] + %idx = getelementptr i32, i32* %haystack, i64 %indvar + %ld = load i32, i32* %idx + %cmp1 = icmp eq i32 %ld, %hay + %zext = zext i1 %cmp1 to i8 + %and = and i8 %found, %zext + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, 1000 + br i1 %exitcond, label %exit, label %loop + +exit: + %tobool = icmp ne i8 %and, 0 + ret i1 %tobool +} + +; FIXME: +; Narrowing should work with an 'xor' and is not limited to bool types. + +define i32 @shrinkLogicAndPhi1(i8 %x, i1 %cond) { +; CHECK-LABEL: @shrinkLogicAndPhi1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ENDIF:%.*]] +; CHECK: if: +; CHECK-NEXT: br label [[ENDIF]] +; CHECK: endif: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 21, [[ENTRY:%.*]] ], [ 33, [[IF]] ] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[LOGIC:%.*]] = xor i32 [[PHI]], [[ZEXT]] +; CHECK-NEXT: ret i32 [[LOGIC]] +; +entry: + br i1 %cond, label %if, label %endif +if: + br label %endif +endif: + %phi = phi i32 [ 21, %entry], [ 33, %if ] + %zext = zext i8 %x to i32 + %logic = xor i32 %phi, %zext + ret i32 %logic +} + +; FIXME: +; Narrowing should work with an 'xor' and is not limited to bool types. +; Test that commuting the xor operands does not inhibit optimization. 
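+; A sketch of the ideal output for this test (again assuming the narrowing
+; fold is implemented):
+;   %phi = phi i8 [ 21, %entry ], [ 33, %if ]
+;   %logic = xor i8 %phi, %x
+;   %r = zext i8 %logic to i32
+; i.e. the xor happens in i8 and a single zext widens the result.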
+ +define i32 @shrinkLogicAndPhi2(i8 %x, i1 %cond) { +; CHECK-LABEL: @shrinkLogicAndPhi2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ENDIF:%.*]] +; CHECK: if: +; CHECK-NEXT: br label [[ENDIF]] +; CHECK: endif: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 21, [[ENTRY:%.*]] ], [ 33, [[IF]] ] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[LOGIC:%.*]] = xor i32 [[PHI]], [[ZEXT]] +; CHECK-NEXT: ret i32 [[LOGIC]] +; +entry: + br i1 %cond, label %if, label %endif +if: + br label %endif +endif: + %phi = phi i32 [ 21, %entry], [ 33, %if ] + %zext = zext i8 %x to i32 + %logic = xor i32 %zext, %phi + ret i32 %logic +} + diff --git a/test/Transforms/InstCombine/not-fcmp.ll b/test/Transforms/InstCombine/not-fcmp.ll deleted file mode 100644 index 9718e0b905fc..000000000000 --- a/test/Transforms/InstCombine/not-fcmp.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s -; PR1570 - -define i1 @f(float %X, float %Y) { -entry: - %tmp3 = fcmp olt float %X, %Y ; <i1> [#uses=1] - %toBoolnot5 = xor i1 %tmp3, true ; <i1> [#uses=1] - ret i1 %toBoolnot5 -; CHECK-LABEL: @f( -; CHECK-NEXT: entry: -; CHECK-NEXT: %toBoolnot5 = fcmp uge float %X, %Y -; CHECK-NEXT: ret i1 %toBoolnot5 -} diff --git a/test/Transforms/InstCombine/not.ll b/test/Transforms/InstCombine/not.ll index edb402a125ac..d0c242f65558 100644 --- a/test/Transforms/InstCombine/not.ll +++ b/test/Transforms/InstCombine/not.ll @@ -1,61 +1,95 @@ -; This test makes sure that these instructions are properly eliminated. -; - ; RUN: opt < %s -instcombine -S | FileCheck %s -; CHECK-NOT: xor define i32 @test1(i32 %A) { - %B = xor i32 %A, -1 - %C = xor i32 %B, -1 - ret i32 %C +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret i32 %A +; + %B = xor i32 %A, -1 + %C = xor i32 %B, -1 + ret i32 %C } -define i1 @test2(i32 %A, i32 %B) { - ; Can change into setge - %cond = icmp sle i32 %A, %B - %Ret = xor i1 %cond, true - ret i1 %Ret +define i1 @invert_icmp(i32 %A, i32 %B) { +; CHECK-LABEL: @invert_icmp( +; CHECK-NEXT: [[NOT:%.*]] = icmp sgt i32 %A, %B +; CHECK-NEXT: ret i1 [[NOT]] +; + %cmp = icmp sle i32 %A, %B + %not = xor i1 %cmp, true + ret i1 %not +} + +; PR1570 + +define i1 @invert_fcmp(float %X, float %Y) { +; CHECK-LABEL: @invert_fcmp( +; CHECK-NEXT: [[NOT:%.*]] = fcmp uge float %X, %Y +; CHECK-NEXT: ret i1 [[NOT]] +; + %cmp = fcmp olt float %X, %Y + %not = xor i1 %cmp, true + ret i1 %not } ; Test that De Morgan's law can be instcombined. define i32 @test3(i32 %A, i32 %B) { - %a = xor i32 %A, -1 - %b = xor i32 %B, -1 - %c = and i32 %a, %b - %d = xor i32 %c, -1 - ret i32 %d +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[C_DEMORGAN:%.*]] = or i32 %A, %B +; CHECK-NEXT: ret i32 [[C_DEMORGAN]] +; + %a = xor i32 %A, -1 + %b = xor i32 %B, -1 + %c = and i32 %a, %b + %d = xor i32 %c, -1 + ret i32 %d } ; Test that De Morgan's law can work with constants. define i32 @test4(i32 %A, i32 %B) { - %a = xor i32 %A, -1 - %c = and i32 %a, 5 - %d = xor i32 %c, -1 - ret i32 %d +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[D1:%.*]] = or i32 %A, -6 +; CHECK-NEXT: ret i32 [[D1]] +; + %a = xor i32 %A, -1 + %c = and i32 %a, 5 + %d = xor i32 %c, -1 + ret i32 %d } ; Test the mirror of De Morgan's law. 
define i32 @test5(i32 %A, i32 %B) { - %a = xor i32 %A, -1 - %b = xor i32 %B, -1 - %c = or i32 %a, %b - %d = xor i32 %c, -1 - ret i32 %d +; CHECK-LABEL: @test5( +; CHECK-NEXT: [[C_DEMORGAN:%.*]] = and i32 %A, %B +; CHECK-NEXT: ret i32 [[C_DEMORGAN]] +; + %a = xor i32 %A, -1 + %b = xor i32 %B, -1 + %c = or i32 %a, %b + %d = xor i32 %c, -1 + ret i32 %d } ; PR2298 define zeroext i8 @test6(i32 %a, i32 %b) { -entry: - %tmp1not = xor i32 %a, -1 - %tmp2not = xor i32 %b, -1 - %tmp3 = icmp slt i32 %tmp1not, %tmp2not - %retval67 = zext i1 %tmp3 to i8 - ret i8 %retval67 +; CHECK-LABEL: @test6( +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i32 %b, %a +; CHECK-NEXT: [[RETVAL67:%.*]] = zext i1 [[TMP3]] to i8 +; CHECK-NEXT: ret i8 [[RETVAL67]] +; + %tmp1not = xor i32 %a, -1 + %tmp2not = xor i32 %b, -1 + %tmp3 = icmp slt i32 %tmp1not, %tmp2not + %retval67 = zext i1 %tmp3 to i8 + ret i8 %retval67 } define <2 x i1> @test7(<2 x i32> %A, <2 x i32> %B) { - %cond = icmp sle <2 x i32> %A, %B - %Ret = xor <2 x i1> %cond, <i1 true, i1 true> - ret <2 x i1> %Ret +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[RET:%.*]] = icmp sgt <2 x i32> %A, %B +; CHECK-NEXT: ret <2 x i1> [[RET]] +; + %cond = icmp sle <2 x i32> %A, %B + %Ret = xor <2 x i1> %cond, <i1 true, i1 true> + ret <2 x i1> %Ret } diff --git a/test/Transforms/InstCombine/nvvm-intrins.ll b/test/Transforms/InstCombine/nvvm-intrins.ll new file mode 100644 index 000000000000..cb65b8fdc547 --- /dev/null +++ b/test/Transforms/InstCombine/nvvm-intrins.ll @@ -0,0 +1,471 @@ +; Check that nvvm intrinsics get simplified to target-generic intrinsics where +; possible. +; +; We run this test twice; once with ftz on, and again with ftz off. Behold the +; hackery: + +; RUN: cat %s > %t.ftz +; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "true" }' >> %t.ftz +; RUN: opt < %t.ftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ + +; RUN: cat %s > %t.noftz +; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "false" }' >> %t.noftz +; RUN: opt < %t.noftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ + +; We handle nvvm intrinsics with ftz variants as follows: +; - If the module is in ftz mode, the ftz variant is transformed into the +; regular llvm intrinsic, and the non-ftz variant is left alone. +; - If the module is not in ftz mode, it's the reverse: Only the non-ftz +; variant is transformed, and the ftz variant is left alone. + +; Check NVVM intrinsics that map directly to LLVM target-generic intrinsics. 
+ +; CHECK-LABEL: @ceil_double +define double @ceil_double(double %a) #0 { +; CHECK: call double @llvm.ceil.f64 + %ret = call double @llvm.nvvm.ceil.d(double %a) + ret double %ret +} +; CHECK-LABEL: @ceil_float +define float @ceil_float(float %a) #0 { +; NOFTZ: call float @llvm.ceil.f32 +; FTZ: call float @llvm.nvvm.ceil.f + %ret = call float @llvm.nvvm.ceil.f(float %a) + ret float %ret +} +; CHECK-LABEL: @ceil_float_ftz +define float @ceil_float_ftz(float %a) #0 { +; NOFTZ: call float @llvm.nvvm.ceil.ftz.f +; FTZ: call float @llvm.ceil.f32 + %ret = call float @llvm.nvvm.ceil.ftz.f(float %a) + ret float %ret +} + +; CHECK-LABEL: @fabs_double +define double @fabs_double(double %a) #0 { +; CHECK: call double @llvm.fabs.f64 + %ret = call double @llvm.nvvm.fabs.d(double %a) + ret double %ret +} +; CHECK-LABEL: @fabs_float +define float @fabs_float(float %a) #0 { +; NOFTZ: call float @llvm.fabs.f32 +; FTZ: call float @llvm.nvvm.fabs.f + %ret = call float @llvm.nvvm.fabs.f(float %a) + ret float %ret +} +; CHECK-LABEL: @fabs_float_ftz +define float @fabs_float_ftz(float %a) #0 { +; NOFTZ: call float @llvm.nvvm.fabs.ftz.f +; FTZ: call float @llvm.fabs.f32 + %ret = call float @llvm.nvvm.fabs.ftz.f(float %a) + ret float %ret +} + +; CHECK-LABEL: @floor_double +define double @floor_double(double %a) #0 { +; CHECK: call double @llvm.floor.f64 + %ret = call double @llvm.nvvm.floor.d(double %a) + ret double %ret +} +; CHECK-LABEL: @floor_float +define float @floor_float(float %a) #0 { +; NOFTZ: call float @llvm.floor.f32 +; FTZ: call float @llvm.nvvm.floor.f + %ret = call float @llvm.nvvm.floor.f(float %a) + ret float %ret +} +; CHECK-LABEL: @floor_float_ftz +define float @floor_float_ftz(float %a) #0 { +; NOFTZ: call float @llvm.nvvm.floor.ftz.f +; FTZ: call float @llvm.floor.f32 + %ret = call float @llvm.nvvm.floor.ftz.f(float %a) + ret float %ret +} + +; CHECK-LABEL: @fma_double +define double @fma_double(double %a, double %b, double %c) #0 { +; CHECK: call double @llvm.fma.f64 + %ret = call double @llvm.nvvm.fma.rn.d(double %a, double %b, double %c) + ret double %ret +} +; CHECK-LABEL: @fma_float +define float @fma_float(float %a, float %b, float %c) #0 { +; NOFTZ: call float @llvm.fma.f32 +; FTZ: call float @llvm.nvvm.fma.rn.f + %ret = call float @llvm.nvvm.fma.rn.f(float %a, float %b, float %c) + ret float %ret +} +; CHECK-LABEL: @fma_float_ftz +define float @fma_float_ftz(float %a, float %b, float %c) #0 { +; NOFTZ: call float @llvm.nvvm.fma.rn.ftz.f +; FTZ: call float @llvm.fma.f32 + %ret = call float @llvm.nvvm.fma.rn.ftz.f(float %a, float %b, float %c) + ret float %ret +} + +; CHECK-LABEL: @fmax_double +define double @fmax_double(double %a, double %b) #0 { +; CHECK: call double @llvm.maxnum.f64 + %ret = call double @llvm.nvvm.fmax.d(double %a, double %b) + ret double %ret +} +; CHECK-LABEL: @fmax_float +define float @fmax_float(float %a, float %b) #0 { +; NOFTZ: call float @llvm.maxnum.f32 +; FTZ: call float @llvm.nvvm.fmax.f + %ret = call float @llvm.nvvm.fmax.f(float %a, float %b) + ret float %ret +} +; CHECK-LABEL: @fmax_float_ftz +define float @fmax_float_ftz(float %a, float %b) #0 { +; NOFTZ: call float @llvm.nvvm.fmax.ftz.f +; FTZ: call float @llvm.maxnum.f32 + %ret = call float @llvm.nvvm.fmax.ftz.f(float %a, float %b) + ret float %ret +} + +; CHECK-LABEL: @fmin_double +define double @fmin_double(double %a, double %b) #0 { +; CHECK: call double @llvm.minnum.f64 + %ret = call double @llvm.nvvm.fmin.d(double %a, double %b) + ret double %ret +} +; CHECK-LABEL: @fmin_float +define float 
@fmin_float(float %a, float %b) #0 { +; NOFTZ: call float @llvm.minnum.f32 +; FTZ: call float @llvm.nvvm.fmin.f + %ret = call float @llvm.nvvm.fmin.f(float %a, float %b) + ret float %ret +} +; CHECK-LABEL: @fmin_float_ftz +define float @fmin_float_ftz(float %a, float %b) #0 { +; NOFTZ: call float @llvm.nvvm.fmin.ftz.f +; FTZ: call float @llvm.minnum.f32 + %ret = call float @llvm.nvvm.fmin.ftz.f(float %a, float %b) + ret float %ret +} + +; CHECK-LABEL: @round_double +define double @round_double(double %a) #0 { +; CHECK: call double @llvm.round.f64 + %ret = call double @llvm.nvvm.round.d(double %a) + ret double %ret +} +; CHECK-LABEL: @round_float +define float @round_float(float %a) #0 { +; NOFTZ: call float @llvm.round.f32 +; FTZ: call float @llvm.nvvm.round.f + %ret = call float @llvm.nvvm.round.f(float %a) + ret float %ret +} +; CHECK-LABEL: @round_float_ftz +define float @round_float_ftz(float %a) #0 { +; NOFTZ: call float @llvm.nvvm.round.ftz.f +; FTZ: call float @llvm.round.f32 + %ret = call float @llvm.nvvm.round.ftz.f(float %a) + ret float %ret +} + +; CHECK-LABEL: @trunc_double +define double @trunc_double(double %a) #0 { +; CHECK: call double @llvm.trunc.f64 + %ret = call double @llvm.nvvm.trunc.d(double %a) + ret double %ret +} +; CHECK-LABEL: @trunc_float +define float @trunc_float(float %a) #0 { +; NOFTZ: call float @llvm.trunc.f32 +; FTZ: call float @llvm.nvvm.trunc.f + %ret = call float @llvm.nvvm.trunc.f(float %a) + ret float %ret +} +; CHECK-LABEL: @trunc_float_ftz +define float @trunc_float_ftz(float %a) #0 { +; NOFTZ: call float @llvm.nvvm.trunc.ftz.f +; FTZ: call float @llvm.trunc.f32 + %ret = call float @llvm.nvvm.trunc.ftz.f(float %a) + ret float %ret +} + +; Check NVVM intrinsics that correspond to LLVM cast operations. + +; CHECK-LABEL: @test_d2i +define i32 @test_d2i(double %a) #0 { +; CHECK: fptosi double %a to i32 + %ret = call i32 @llvm.nvvm.d2i.rz(double %a) + ret i32 %ret +} +; CHECK-LABEL: @test_f2i +define i32 @test_f2i(float %a) #0 { +; CHECK: fptosi float %a to i32 + %ret = call i32 @llvm.nvvm.f2i.rz(float %a) + ret i32 %ret +} +; CHECK-LABEL: @test_d2ll +define i64 @test_d2ll(double %a) #0 { +; CHECK: fptosi double %a to i64 + %ret = call i64 @llvm.nvvm.d2ll.rz(double %a) + ret i64 %ret +} +; CHECK-LABEL: @test_f2ll +define i64 @test_f2ll(float %a) #0 { +; CHECK: fptosi float %a to i64 + %ret = call i64 @llvm.nvvm.f2ll.rz(float %a) + ret i64 %ret +} +; CHECK-LABEL: @test_d2ui +define i32 @test_d2ui(double %a) #0 { +; CHECK: fptoui double %a to i32 + %ret = call i32 @llvm.nvvm.d2ui.rz(double %a) + ret i32 %ret +} +; CHECK-LABEL: @test_f2ui +define i32 @test_f2ui(float %a) #0 { +; CHECK: fptoui float %a to i32 + %ret = call i32 @llvm.nvvm.f2ui.rz(float %a) + ret i32 %ret +} +; CHECK-LABEL: @test_d2ull +define i64 @test_d2ull(double %a) #0 { +; CHECK: fptoui double %a to i64 + %ret = call i64 @llvm.nvvm.d2ull.rz(double %a) + ret i64 %ret +} +; CHECK-LABEL: @test_f2ull +define i64 @test_f2ull(float %a) #0 { +; CHECK: fptoui float %a to i64 + %ret = call i64 @llvm.nvvm.f2ull.rz(float %a) + ret i64 %ret +} + +; CHECK-LABEL: @test_i2d +define double @test_i2d(i32 %a) #0 { +; CHECK: sitofp i32 %a to double + %ret = call double @llvm.nvvm.i2d.rz(i32 %a) + ret double %ret +} +; CHECK-LABEL: @test_i2f +define float @test_i2f(i32 %a) #0 { +; CHECK: sitofp i32 %a to float + %ret = call float @llvm.nvvm.i2f.rz(i32 %a) + ret float %ret +} +; CHECK-LABEL: @test_ll2d +define double @test_ll2d(i64 %a) #0 { +; CHECK: sitofp i64 %a to double + %ret = call double 
@llvm.nvvm.ll2d.rz(i64 %a) + ret double %ret +} +; CHECK-LABEL: @test_ll2f +define float @test_ll2f(i64 %a) #0 { +; CHECK: sitofp i64 %a to float + %ret = call float @llvm.nvvm.ll2f.rz(i64 %a) + ret float %ret +} +; CHECK-LABEL: @test_ui2d +define double @test_ui2d(i32 %a) #0 { +; CHECK: uitofp i32 %a to double + %ret = call double @llvm.nvvm.ui2d.rz(i32 %a) + ret double %ret +} +; CHECK-LABEL: @test_ui2f +define float @test_ui2f(i32 %a) #0 { +; CHECK: uitofp i32 %a to float + %ret = call float @llvm.nvvm.ui2f.rz(i32 %a) + ret float %ret +} +; CHECK-LABEL: @test_ull2d +define double @test_ull2d(i64 %a) #0 { +; CHECK: uitofp i64 %a to double + %ret = call double @llvm.nvvm.ull2d.rz(i64 %a) + ret double %ret +} +; CHECK-LABEL: @test_ull2f +define float @test_ull2f(i64 %a) #0 { +; CHECK: uitofp i64 %a to float + %ret = call float @llvm.nvvm.ull2f.rz(i64 %a) + ret float %ret +} + +; Check NVVM intrinsics that map to LLVM binary operations. + +; CHECK-LABEL: @test_add_rn_d +define double @test_add_rn_d(double %a, double %b) #0 { +; CHECK: fadd + %ret = call double @llvm.nvvm.add.rn.d(double %a, double %b) + ret double %ret +} +; CHECK-LABEL: @test_add_rn_f +define float @test_add_rn_f(float %a, float %b) #0 { +; NOFTZ: fadd +; FTZ: call float @llvm.nvvm.add.rn.f + %ret = call float @llvm.nvvm.add.rn.f(float %a, float %b) + ret float %ret +} +; CHECK-LABEL: @test_add_rn_f_ftz +define float @test_add_rn_f_ftz(float %a, float %b) #0 { +; NOFTZ: call float @llvm.nvvm.add.rn.f +; FTZ: fadd + %ret = call float @llvm.nvvm.add.rn.ftz.f(float %a, float %b) + ret float %ret +} + +; CHECK-LABEL: @test_mul_rn_d +define double @test_mul_rn_d(double %a, double %b) #0 { +; CHECK: fmul + %ret = call double @llvm.nvvm.mul.rn.d(double %a, double %b) + ret double %ret +} +; CHECK-LABEL: @test_mul_rn_f +define float @test_mul_rn_f(float %a, float %b) #0 { +; NOFTZ: fmul +; FTZ: call float @llvm.nvvm.mul.rn.f + %ret = call float @llvm.nvvm.mul.rn.f(float %a, float %b) + ret float %ret +} +; CHECK-LABEL: @test_mul_rn_f_ftz +define float @test_mul_rn_f_ftz(float %a, float %b) #0 { +; NOFTZ: call float @llvm.nvvm.mul.rn.f +; FTZ: fmul + %ret = call float @llvm.nvvm.mul.rn.ftz.f(float %a, float %b) + ret float %ret +} + +; CHECK-LABEL: @test_div_rn_d +define double @test_div_rn_d(double %a, double %b) #0 { +; CHECK: fdiv + %ret = call double @llvm.nvvm.div.rn.d(double %a, double %b) + ret double %ret +} +; CHECK-LABEL: @test_div_rn_f +define float @test_div_rn_f(float %a, float %b) #0 { +; NOFTZ: fdiv +; FTZ: call float @llvm.nvvm.div.rn.f + %ret = call float @llvm.nvvm.div.rn.f(float %a, float %b) + ret float %ret +} +; CHECK-LABEL: @test_div_rn_f_ftz +define float @test_div_rn_f_ftz(float %a, float %b) #0 { +; NOFTZ: call float @llvm.nvvm.div.rn.f +; FTZ: fdiv + %ret = call float @llvm.nvvm.div.rn.ftz.f(float %a, float %b) + ret float %ret +} + +; Check NVVM intrinsics that require us to emit custom IR. 
+ +; CHECK-LABEL: @test_rcp_rn_f +define float @test_rcp_rn_f(float %a) #0 { +; NOFTZ: fdiv float 1.0{{.*}} %a +; FTZ: call float @llvm.nvvm.rcp.rn.f + %ret = call float @llvm.nvvm.rcp.rn.f(float %a) + ret float %ret +} +; CHECK-LABEL: @test_rcp_rn_f_ftz +define float @test_rcp_rn_f_ftz(float %a) #0 { +; NOFTZ: call float @llvm.nvvm.rcp.rn.f +; FTZ: fdiv float 1.0{{.*}} %a + %ret = call float @llvm.nvvm.rcp.rn.ftz.f(float %a) + ret float %ret +} + +; CHECK-LABEL: @test_sqrt_rn_d +define double @test_sqrt_rn_d(double %a) #0 { +; CHECK: call double @llvm.sqrt.f64(double %a) + %ret = call double @llvm.nvvm.sqrt.rn.d(double %a) + ret double %ret +} +; nvvm.sqrt.f is a special case: It goes to a llvm.sqrt.f +; CHECK-LABEL: @test_sqrt_f +define float @test_sqrt_f(float %a) #0 { +; CHECK: call float @llvm.sqrt.f32(float %a) + %ret = call float @llvm.nvvm.sqrt.f(float %a) + ret float %ret +} +; CHECK-LABEL: @test_sqrt_rn_f +define float @test_sqrt_rn_f(float %a) #0 { +; NOFTZ: call float @llvm.sqrt.f32(float %a) +; FTZ: call float @llvm.nvvm.sqrt.rn.f + %ret = call float @llvm.nvvm.sqrt.rn.f(float %a) + ret float %ret +} +; CHECK-LABEL: @test_sqrt_rn_f_ftz +define float @test_sqrt_rn_f_ftz(float %a) #0 { +; NOFTZ: call float @llvm.nvvm.sqrt.rn.f +; FTZ: call float @llvm.sqrt.f32(float %a) + %ret = call float @llvm.nvvm.sqrt.rn.ftz.f(float %a) + ret float %ret +} + +declare double @llvm.nvvm.add.rn.d(double, double) +declare float @llvm.nvvm.add.rn.f(float, float) +declare float @llvm.nvvm.add.rn.ftz.f(float, float) +declare double @llvm.nvvm.ceil.d(double) +declare float @llvm.nvvm.ceil.f(float) +declare float @llvm.nvvm.ceil.ftz.f(float) +declare float @llvm.nvvm.d2f.rm(double) +declare float @llvm.nvvm.d2f.rm.ftz(double) +declare float @llvm.nvvm.d2f.rp(double) +declare float @llvm.nvvm.d2f.rp.ftz(double) +declare float @llvm.nvvm.d2f.rz(double) +declare float @llvm.nvvm.d2f.rz.ftz(double) +declare i32 @llvm.nvvm.d2i.rz(double) +declare i64 @llvm.nvvm.d2ll.rz(double) +declare i32 @llvm.nvvm.d2ui.rz(double) +declare i64 @llvm.nvvm.d2ull.rz(double) +declare double @llvm.nvvm.div.rn.d(double, double) +declare float @llvm.nvvm.div.rn.f(float, float) +declare float @llvm.nvvm.div.rn.ftz.f(float, float) +declare i16 @llvm.nvvm.f2h.rz(float) +declare i16 @llvm.nvvm.f2h.rz.ftz(float) +declare i32 @llvm.nvvm.f2i.rz(float) +declare i32 @llvm.nvvm.f2i.rz.ftz(float) +declare i64 @llvm.nvvm.f2ll.rz(float) +declare i64 @llvm.nvvm.f2ll.rz.ftz(float) +declare i32 @llvm.nvvm.f2ui.rz(float) +declare i32 @llvm.nvvm.f2ui.rz.ftz(float) +declare i64 @llvm.nvvm.f2ull.rz(float) +declare i64 @llvm.nvvm.f2ull.rz.ftz(float) +declare double @llvm.nvvm.fabs.d(double) +declare float @llvm.nvvm.fabs.f(float) +declare float @llvm.nvvm.fabs.ftz.f(float) +declare double @llvm.nvvm.floor.d(double) +declare float @llvm.nvvm.floor.f(float) +declare float @llvm.nvvm.floor.ftz.f(float) +declare double @llvm.nvvm.fma.rn.d(double, double, double) +declare float @llvm.nvvm.fma.rn.f(float, float, float) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) +declare double @llvm.nvvm.fmax.d(double, double) +declare float @llvm.nvvm.fmax.f(float, float) +declare float @llvm.nvvm.fmax.ftz.f(float, float) +declare double @llvm.nvvm.fmin.d(double, double) +declare float @llvm.nvvm.fmin.f(float, float) +declare float @llvm.nvvm.fmin.ftz.f(float, float) +declare double @llvm.nvvm.i2d.rz(i32) +declare float @llvm.nvvm.i2f.rz(i32) +declare double @llvm.nvvm.ll2d.rz(i64) +declare float @llvm.nvvm.ll2f.rz(i64) +declare double 
@llvm.nvvm.lohi.i2d(i32, i32) +declare double @llvm.nvvm.mul.rn.d(double, double) +declare float @llvm.nvvm.mul.rn.f(float, float) +declare float @llvm.nvvm.mul.rn.ftz.f(float, float) +declare double @llvm.nvvm.rcp.rm.d(double) +declare double @llvm.nvvm.rcp.rn.d(double) +declare float @llvm.nvvm.rcp.rn.f(float) +declare float @llvm.nvvm.rcp.rn.ftz.f(float) +declare double @llvm.nvvm.round.d(double) +declare float @llvm.nvvm.round.f(float) +declare float @llvm.nvvm.round.ftz.f(float) +declare float @llvm.nvvm.sqrt.f(float) +declare double @llvm.nvvm.sqrt.rn.d(double) +declare float @llvm.nvvm.sqrt.rn.f(float) +declare float @llvm.nvvm.sqrt.rn.ftz.f(float) +declare double @llvm.nvvm.trunc.d(double) +declare float @llvm.nvvm.trunc.f(float) +declare float @llvm.nvvm.trunc.ftz.f(float) +declare double @llvm.nvvm.ui2d.rz(i32) +declare float @llvm.nvvm.ui2f.rn(i32) +declare float @llvm.nvvm.ui2f.rz(i32) +declare double @llvm.nvvm.ull2d.rz(i64) +declare float @llvm.nvvm.ull2f.rz(i64) diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll index 2af391f907cc..5c0a36f5feaa 100644 --- a/test/Transforms/InstCombine/objsize.ll +++ b/test/Transforms/InstCombine/objsize.ll @@ -8,7 +8,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3 define i32 @foo() nounwind { ; CHECK-LABEL: @foo( ; CHECK-NEXT: ret i32 60 - %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false) + %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false) ret i32 %1 } @@ -16,7 +16,7 @@ define i8* @bar() nounwind { ; CHECK-LABEL: @bar( entry: %retval = alloca i8* - %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false) + %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false) %cmp = icmp ne i32 %0, -1 ; CHECK: br i1 true br i1 %cmp, label %cond.true, label %cond.false @@ -33,7 +33,7 @@ cond.false: define i32 @f() nounwind { ; CHECK-LABEL: @f( ; CHECK-NEXT: ret i32 0 - %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr ([60 x i8], [60 x i8]* @a, i32 1, i32 0), i1 false) + %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr ([60 x i8], [60 x i8]* @a, i32 1, i32 0), i1 false, i1 false) ret i32 %1 } @@ -42,7 +42,7 @@ define i32 @f() nounwind { define i1 @baz() nounwind { ; CHECK-LABEL: @baz( ; CHECK-NEXT: objectsize - %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8], [0 x i8]* @window, i32 0, i32 0), i1 false) + %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8], [0 x i8]* @window, i32 0, i32 0), i1 false, i1 false) %2 = icmp eq i32 %1, -1 ret i1 %2 } @@ -51,7 +51,7 @@ define void @test1(i8* %q, i32 %x) nounwind noinline { ; CHECK-LABEL: @test1( ; CHECK: objectsize.i32.p0i8 entry: - %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8], [0 x i8]* @window, i32 0, i32 10), i1 false) ; <i64> [#uses=1] + %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8], [0 x i8]* @window, i32 0, i32 10), i1 false, i1 false) ; <i64> [#uses=1] %1 = icmp eq i32 %0, -1 ; <i1> [#uses=1] br i1 %1, label %"47", label %"46" @@ -67,7 +67,7 @@ entry: define i32 @test2() nounwind { ; CHECK-LABEL: @test2( ; CHECK-NEXT: ret i32 34 - %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr (i8, i8* bitcast ([9 x 
i32]* @.str5 to i8*), i32 2), i1 false) + %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr (i8, i8* bitcast ([9 x i32]* @.str5 to i8*), i32 2), i1 false, i1 false) ret i32 %1 } @@ -76,7 +76,9 @@ define i32 @test2() nounwind { declare i8* @__memcpy_chk(i8*, i8*, i32, i32) nounwind -declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly +declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) nounwind readonly + +declare i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)*, i1, i1) nounwind readonly declare i8* @__inline_memcpy_chk(i8*, i8*, i32) nounwind inlinehint @@ -88,7 +90,7 @@ entry: bb11: %0 = getelementptr inbounds float, float* getelementptr inbounds ([480 x float], [480 x float]* @array, i32 0, i32 128), i32 -127 ; <float*> [#uses=1] %1 = bitcast float* %0 to i8* ; <i8*> [#uses=1] - %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false) ; <i32> [#uses=1] + %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false, i1 false) ; <i32> [#uses=1] %3 = call i8* @__memcpy_chk(i8* undef, i8* undef, i32 512, i32 %2) nounwind ; <i8*> [#uses=0] ; CHECK: unreachable unreachable @@ -110,7 +112,7 @@ define i32 @test4(i8** %esc) nounwind ssp { entry: %0 = alloca %struct.data, align 8 %1 = bitcast %struct.data* %0 to i8* - %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false) nounwind + %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false, i1 false) nounwind ; CHECK-NOT: @llvm.objectsize ; CHECK: @llvm.memset.p0i8.i32(i8* %1, i8 0, i32 1824, i32 8, i1 false) %3 = call i8* @__memset_chk(i8* %1, i32 0, i32 1824, i32 %2) nounwind @@ -125,7 +127,7 @@ define i8* @test5(i32 %n) nounwind ssp { ; CHECK-LABEL: @test5( entry: %0 = tail call noalias i8* @malloc(i32 20) nounwind - %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false) + %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false, i1 false) %2 = load i8*, i8** @s, align 8 ; CHECK-NOT: @llvm.objectsize ; CHECK: @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 10, i32 1, i1 false) @@ -137,7 +139,7 @@ define void @test6(i32 %n) nounwind ssp { ; CHECK-LABEL: @test6( entry: %0 = tail call noalias i8* @malloc(i32 20) nounwind - %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false) + %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false, i1 false) %2 = load i8*, i8** @s, align 8 ; CHECK-NOT: @llvm.objectsize ; CHECK: @__memcpy_chk(i8* %0, i8* %1, i32 30, i32 20) @@ -154,7 +156,7 @@ define i32 @test7(i8** %esc) { %alloc = call noalias i8* @malloc(i32 48) nounwind store i8* %alloc, i8** %esc %gep = getelementptr inbounds i8, i8* %alloc, i32 16 - %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false) nounwind readonly + %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false, i1 false) nounwind readonly ; CHECK: ret i32 32 ret i32 %objsize } @@ -166,7 +168,7 @@ define i32 @test8(i8** %esc) { %alloc = call noalias i8* @calloc(i32 5, i32 7) nounwind store i8* %alloc, i8** %esc %gep = getelementptr inbounds i8, i8* %alloc, i32 5 - %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false) nounwind readonly + %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false, i1 false) nounwind readonly ; CHECK: ret i32 30 ret i32 %objsize } @@ -178,7 +180,7 @@ declare noalias i8* @strndup(i8* nocapture, i32) nounwind define i32 @test9(i8** %esc) { %call = tail call i8* @strdup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0)) nounwind store i8* %call, i8** %esc, align 8 - %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true) + %1 = tail call i32 
@llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false) ; CHECK: ret i32 8 ret i32 %1 } @@ -187,7 +189,7 @@ define i32 @test9(i8** %esc) { define i32 @test10(i8** %esc) { %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i32 3) nounwind store i8* %call, i8** %esc, align 8 - %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true) + %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false) ; CHECK: ret i32 4 ret i32 %1 } @@ -196,7 +198,7 @@ define i32 @test10(i8** %esc) { define i32 @test11(i8** %esc) { %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i32 7) nounwind store i8* %call, i8** %esc, align 8 - %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true) + %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false) ; CHECK: ret i32 8 ret i32 %1 } @@ -205,7 +207,7 @@ define i32 @test11(i8** %esc) { define i32 @test12(i8** %esc) { %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i32 8) nounwind store i8* %call, i8** %esc, align 8 - %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true) + %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false) ; CHECK: ret i32 8 ret i32 %1 } @@ -214,7 +216,7 @@ define i32 @test12(i8** %esc) { define i32 @test13(i8** %esc) { %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i32 57) nounwind store i8* %call, i8** %esc, align 8 - %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true) + %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false) ; CHECK: ret i32 8 ret i32 %1 } @@ -225,7 +227,7 @@ define i32 @test13(i8** %esc) { ; CHECK-NEXT: ret i32 60 define i32 @test18() { %bc = bitcast [60 x i8]* @globalalias to i8* - %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false) + %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false, i1 false) ret i32 %1 } @@ -235,7 +237,67 @@ define i32 @test18() { ; CHECK: llvm.objectsize define i32 @test19() { %bc = bitcast [60 x i8]* @globalalias2 to i8* - %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false) + %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false, i1 false) + ret i32 %1 +} + +; CHECK-LABEL: @test20( +; CHECK: ret i32 0 +define i32 @test20() { + %1 = call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 false, i1 false) + ret i32 %1 +} + +; CHECK-LABEL: @test21( +; CHECK: ret i32 0 +define i32 @test21() { + %1 = call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 true, i1 false) + ret i32 %1 +} + +; CHECK-LABEL: @test22( +; CHECK: llvm.objectsize +define i32 @test22() { + %1 = call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 false, i1 true) + ret i32 %1 +} + +; CHECK-LABEL: @test23( +; CHECK: llvm.objectsize +define i32 @test23() { + %1 = call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 true, i1 true) ret i32 %1 } +; 1 is an arbitrary non-zero address space. 
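; (Illustrative note on the signature these null tests exercise; the flag
; names are assumptions for readability, not taken from this diff:
;
;   declare i32 @llvm.objectsize.i32.p0i8(i8*, i1 %min, i1 %null_is_unknown)
;
; With %null_is_unknown false, a null pointer is treated as a zero-size
; object and the call folds to 0 (test20/test21); with it true, null in
; address space 0 might genuinely point at an object of unknown size, so the
; call is left alone (test22/test23). The tests below expect the fold to 0
; either way once the pointer lives in a non-zero address space.)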
+; CHECK-LABEL: @test24( +; CHECK: ret i32 0 +define i32 @test24() { + %1 = call i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)* null, i1 false, + i1 false) + ret i32 %1 +} + +; CHECK-LABEL: @test25( +; CHECK: ret i32 0 +define i32 @test25() { + %1 = call i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)* null, i1 true, + i1 false) + ret i32 %1 +} + +; CHECK-LABEL: @test26( +; CHECK: ret i32 0 +define i32 @test26() { + %1 = call i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)* null, i1 false, + i1 true) + ret i32 %1 +} + +; CHECK-LABEL: @test27( +; CHECK: ret i32 0 +define i32 @test27() { + %1 = call i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)* null, i1 true, + i1 true) + ret i32 %1 +} diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll index 2c9088428bde..41e6d2d1f827 100644 --- a/test/Transforms/InstCombine/or.ll +++ b/test/Transforms/InstCombine/or.ll @@ -207,19 +207,6 @@ define <2 x i1> @test18vec(<2 x i32> %A) { ret <2 x i1> %D } -define i1 @test19(i32 %A) { -; CHECK-LABEL: @test19( -; CHECK-NEXT: [[TMP1:%.*]] = or i32 %A, 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 51 -; CHECK-NEXT: ret i1 [[TMP2]] -; - %B = icmp eq i32 %A, 50 - %C = icmp eq i32 %A, 51 - ;; (A&-2) == 50 - %D = or i1 %B, %C - ret i1 %D -} - define i32 @test20(i32 %x) { ; CHECK-LABEL: @test20( ; CHECK-NEXT: ret i32 %x @@ -490,7 +477,7 @@ define i32 @orsext_to_sel_multi_use(i32 %x, i1 %y) { ; CHECK-LABEL: @orsext_to_sel_multi_use( ; CHECK-NEXT: [[SEXT:%.*]] = sext i1 %y to i32 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEXT]], %x -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEXT]], [[OR]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[OR]], [[SEXT]] ; CHECK-NEXT: ret i32 [[ADD]] ; %sext = sext i1 %y to i32 @@ -521,7 +508,7 @@ define <2 x i132> @orsext_to_sel_vec_swap(<2 x i132> %x, <2 x i1> %y) { define i32 @test39(i32 %a, i32 %b) { ; CHECK-LABEL: @test39( -; CHECK-NEXT: [[OR:%.*]] = or i32 %a, %b +; CHECK-NEXT: [[OR:%.*]] = or i32 %b, %a ; CHECK-NEXT: ret i32 [[OR]] ; %xor = xor i32 %a, -1 @@ -542,6 +529,42 @@ define i32 @test40(i32 %a, i32 %b) { ret i32 %or } +define i32 @test40b(i32 %a, i32 %b) { +; CHECK-LABEL: @test40b( +; CHECK-NEXT: [[XOR:%.*]] = xor i32 %a, -1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[XOR]], %b +; CHECK-NEXT: ret i32 [[OR]] +; + %and = and i32 %b, %a + %xor = xor i32 %a, -1 + %or = or i32 %and, %xor + ret i32 %or +} + +define i32 @test40c(i32 %a, i32 %b) { +; CHECK-LABEL: @test40c( +; CHECK-NEXT: [[XOR:%.*]] = xor i32 %a, -1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[XOR]], %b +; CHECK-NEXT: ret i32 [[OR]] +; + %and = and i32 %b, %a + %xor = xor i32 %a, -1 + %or = or i32 %xor, %and + ret i32 %or +} + +define i32 @test40d(i32 %a, i32 %b) { +; CHECK-LABEL: @test40d( +; CHECK-NEXT: [[XOR:%.*]] = xor i32 %a, -1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[XOR]], %b +; CHECK-NEXT: ret i32 [[OR]] +; + %and = and i32 %a, %b + %xor = xor i32 %a, -1 + %or = or i32 %xor, %and + ret i32 %or +} + define i32 @test41(i32 %a, i32 %b) { ; CHECK-LABEL: @test41( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 @@ -701,3 +724,138 @@ define i1 @test48(i64 %x, i1 %b) { %3 = or i1 %1, %.b ret i1 %3 } + +define i32 @test49(i1 %C) { +; CHECK-LABEL: @test49( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], i32 1019, i32 123 +; CHECK-NEXT: ret i32 [[V]] +; + %A = select i1 %C, i32 1000, i32 10 + %V = or i32 %A, 123 + ret i32 %V +} + +define <2 x i32> @test49vec(i1 %C) { +; CHECK-LABEL: @test49vec( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1019, i32 1019>, <2 x i32> <i32 123, i32 123> +; CHECK-NEXT: ret <2 x 
i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10> + %V = or <2 x i32> %A, <i32 123, i32 123> + ret <2 x i32> %V +} + +define <2 x i32> @test49vec2(i1 %C) { +; CHECK-LABEL: @test49vec2( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1019, i32 2509>, <2 x i32> <i32 123, i32 351> +; CHECK-NEXT: ret <2 x i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30> + %V = or <2 x i32> %A, <i32 123, i32 333> + ret <2 x i32> %V +} + +define i32 @test50(i1 %which) { +; CHECK-LABEL: @test50( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi i32 [ 1019, [[ENTRY:%.*]] ], [ 123, [[DELAY]] ] +; CHECK-NEXT: ret i32 [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi i32 [ 1000, %entry ], [ 10, %delay ] + %value = or i32 %A, 123 + ret i32 %value +} + +define <2 x i32> @test50vec(i1 %which) { +; CHECK-LABEL: @test50vec( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 1019, i32 1019>, [[ENTRY:%.*]] ], [ <i32 123, i32 123>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ] + %value = or <2 x i32> %A, <i32 123, i32 123> + ret <2 x i32> %value +} + +define <2 x i32> @test50vec2(i1 %which) { +; CHECK-LABEL: @test50vec2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 1019, i32 2509>, [[ENTRY:%.*]] ], [ <i32 123, i32 351>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ] + %value = or <2 x i32> %A, <i32 123, i32 333> + ret <2 x i32> %value +} + +define i8 @test51(i8 %a, i8 %b, i8 %c) { +; CHECK-LABEL: @test51( +; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] +; CHECK-NEXT: ret i8 [[X]] +; + %w = mul i8 %b, %c + %z = xor i8 %a, -1 + %y = and i8 %w, %z + %x = or i8 %y, %a + ret i8 %x +} + +define i8 @test52(i8 %a, i8 %b, i8 %c) { +; CHECK-LABEL: @test52( +; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] +; CHECK-NEXT: ret i8 [[X]] +; + %w = mul i8 %b, %c + %z = xor i8 %w, -1 + %y = and i8 %z, %a + %x = or i8 %w, %y + ret i8 %x +} + +define i8 @test53(i8 %a, i8 %b, i8 %c) { +; CHECK-LABEL: @test53( +; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] +; CHECK-NEXT: ret i8 [[X]] +; + %w = mul i8 %b, %c + %z = xor i8 %w, -1 + %y = and i8 %z, %a + %x = or i8 %w, %y + ret i8 %x +} diff --git a/test/Transforms/InstCombine/phi-select-constant.ll b/test/Transforms/InstCombine/phi-select-constant.ll new file mode 100644 index 000000000000..272594d7f4f9 --- /dev/null +++ b/test/Transforms/InstCombine/phi-select-constant.ll @@ -0,0 +1,57 @@ +; RUN: opt < %s -S -instcombine | FileCheck %s +@A = 
extern_weak global i32, align 4 +@B = extern_weak global i32, align 4 + +define i32 @foo(i1 %which) { +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +; CHECK-LABEL: @foo +; CHECK-LABEL: final: +; CHECK: phi i32 [ 1, %entry ], [ select (i1 icmp eq (i32* @A, i32* @B), i32 2, i32 1), %delay ] +final: + %use2 = phi i1 [ false, %entry ], [ icmp eq (i32* @A, i32* @B), %delay ] + %value = select i1 %use2, i32 2, i32 1 + ret i32 %value +} + + +; test folding of select into phi for vectors. +define <4 x i64> @vec1(i1 %which) { +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: +; CHECK-LABEL: @vec1 +; CHECK-LABEL: final: +; CHECK: %phinode = phi <4 x i64> [ zeroinitializer, %entry ], [ <i64 0, i64 0, i64 126, i64 127>, %delay ] +; CHECK-NOT: select +; CHECK: ret <4 x i64> %phinode + %phinode = phi <4 x i1> [ <i1 true, i1 true, i1 true, i1 true>, %entry ], [ <i1 true, i1 true, i1 false, i1 false>, %delay ] + %sel = select <4 x i1> %phinode, <4 x i64> zeroinitializer, <4 x i64> <i64 124, i64 125, i64 126, i64 127> + ret <4 x i64> %sel +} + +define <4 x i64> @vec2(i1 %which) { +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: +; CHECK-LABEL: @vec2 +; CHECK-LABEL: final: +; CHECK: %phinode = phi <4 x i64> [ <i64 124, i64 125, i64 126, i64 127>, %entry ], [ <i64 0, i64 125, i64 0, i64 127>, %delay ] +; CHECK-NOT: select +; CHECK: ret <4 x i64> %phinode + %phinode = phi <4 x i1> [ <i1 false, i1 false, i1 false, i1 false>, %entry ], [ <i1 true, i1 false, i1 true, i1 false>, %delay ] + %sel = select <4 x i1> %phinode, <4 x i64> zeroinitializer, <4 x i64> <i64 124, i64 125, i64 126, i64 127> + ret <4 x i64> %sel +} diff --git a/test/Transforms/InstCombine/phi-select-constexpr.ll b/test/Transforms/InstCombine/phi-select-constexpr.ll deleted file mode 100644 index 054e0691d47a..000000000000 --- a/test/Transforms/InstCombine/phi-select-constexpr.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: opt < %s -S -instcombine | FileCheck %s -@A = extern_weak global i32, align 4 -@B = extern_weak global i32, align 4 - -define i32 @foo(i1 %which) { -entry: - br i1 %which, label %final, label %delay - -delay: - br label %final - -; CHECK-LABEL: final: -; CHECK: phi i32 [ 1, %entry ], [ select (i1 icmp eq (i32* @A, i32* @B), i32 2, i32 1), %delay ] -final: - %use2 = phi i1 [ false, %entry ], [ icmp eq (i32* @A, i32* @B), %delay ] - %value = select i1 %use2, i32 2, i32 1 - ret i32 %value -} - diff --git a/test/Transforms/InstCombine/pow-1.ll b/test/Transforms/InstCombine/pow-1.ll index c9f71fd45721..602c20a1314b 100644 --- a/test/Transforms/InstCombine/pow-1.ll +++ b/test/Transforms/InstCombine/pow-1.ll @@ -72,7 +72,7 @@ define float @test_simplify7(float %x) { ; CHECK-LABEL: @test_simplify7( %retval = call float @powf(float %x, float 0.5) ; CHECK-NEXT: [[SQRTF:%[a-z0-9]+]] = call float @sqrtf(float %x) [[NUW_RO:#[0-9]+]] -; CHECK-NEXT: [[FABSF:%[a-z0-9]+]] = call float @fabsf(float [[SQRTF]]) [[NUW_RO]] +; CHECK-NEXT: [[FABSF:%[a-z0-9]+]] = call float @llvm.fabs.f32(float [[SQRTF]]) ; CHECK-NEXT: [[FCMP:%[a-z0-9]+]] = fcmp oeq float %x, 0xFFF0000000000000 ; CHECK-NEXT: [[SELECT:%[a-z0-9]+]] = select i1 [[FCMP]], float 0x7FF0000000000000, float [[FABSF]] ret float %retval @@ -83,7 +83,7 @@ define double @test_simplify8(double %x) { ; CHECK-LABEL: @test_simplify8( %retval = call double @pow(double %x, double 0.5) ; CHECK-NEXT: [[SQRT:%[a-z0-9]+]] = call double @sqrt(double %x) [[NUW_RO]] -; CHECK-NEXT: [[FABS:%[a-z0-9]+]] 
= call double @fabs(double [[SQRT]]) [[NUW_RO]] +; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @llvm.fabs.f64(double [[SQRT]]) ; CHECK-NEXT: [[FCMP:%[a-z0-9]+]] = fcmp oeq double %x, 0xFFF0000000000000 ; CHECK-NEXT: [[SELECT:%[a-z0-9]+]] = select i1 [[FCMP]], double 0x7FF0000000000000, double [[FABS]] ret double %retval @@ -163,7 +163,7 @@ define double @test_simplify17(double %x) { ; CHECK-LABEL: @test_simplify17( %retval = call double @llvm.pow.f64(double %x, double 0.5) ; CHECK-NEXT: [[SQRT:%[a-z0-9]+]] = call double @sqrt(double %x) -; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @fabs(double [[SQRT]]) +; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @llvm.fabs.f64(double [[SQRT]]) ; CHECK-NEXT: [[FCMP:%[a-z0-9]+]] = fcmp oeq double %x, 0xFFF0000000000000 ; CHECK-NEXT: [[SELECT:%[a-z0-9]+]] = select i1 [[FCMP]], double 0x7FF0000000000000, double [[FABS]] ret double %retval diff --git a/test/Transforms/InstCombine/pr17827.ll b/test/Transforms/InstCombine/pr17827.ll index a3ed5e1697ec..ada6edab69c6 100644 --- a/test/Transforms/InstCombine/pr17827.ll +++ b/test/Transforms/InstCombine/pr17827.ll @@ -48,14 +48,14 @@ define i1 @test_shift_and_cmp_changed1(i8 %p, i8 %q) { } ; FIXME: Vectors should fold the same way. + define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) { ; CHECK-LABEL: @test_shift_and_cmp_changed1_vec( ; CHECK-NEXT: [[ANDP:%.*]] = and <2 x i8> %p, <i8 6, i8 6> ; CHECK-NEXT: [[ANDQ:%.*]] = and <2 x i8> %q, <i8 8, i8 8> ; CHECK-NEXT: [[OR:%.*]] = or <2 x i8> [[ANDQ]], [[ANDP]] ; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i8> [[OR]], <i8 5, i8 5> -; CHECK-NEXT: [[ASHR:%.*]] = ashr <2 x i8> [[SHL]], <i8 5, i8 5> -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[ASHR]], <i8 1, i8 1> +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[SHL]], <i8 32, i8 32> ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %andp = and <2 x i8> %p, <i8 6, i8 6> diff --git a/test/Transforms/InstCombine/pr19420.ll b/test/Transforms/InstCombine/pr19420.ll index 23fa0a409745..015f35eaaa53 100644 --- a/test/Transforms/InstCombine/pr19420.ll +++ b/test/Transforms/InstCombine/pr19420.ll @@ -1,36 +1,44 @@ ; RUN: opt -S -instcombine < %s | FileCheck %s -; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL -; CHECK: mul <4 x i32> %in, <i32 0, i32 -32, i32 0, i32 -32> -; CHECK-NEXT: ret define <4 x i32> @test_FoldShiftByConstant_CreateSHL(<4 x i32> %in) { +; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL( +; CHECK-NEXT: [[VSHL_N:%.*]] = mul <4 x i32> %in, <i32 0, i32 -32, i32 0, i32 -32> +; CHECK-NEXT: ret <4 x i32> [[VSHL_N]] +; %mul.i = mul <4 x i32> %in, <i32 0, i32 -1, i32 0, i32 -1> %vshl_n = shl <4 x i32> %mul.i, <i32 5, i32 5, i32 5, i32 5> ret <4 x i32> %vshl_n } -; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL2 -; CHECK: mul <8 x i16> %in, <i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32> -; CHECK-NEXT: ret define <8 x i16> @test_FoldShiftByConstant_CreateSHL2(<8 x i16> %in) { +; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL2( +; CHECK-NEXT: [[VSHL_N:%.*]] = mul <8 x i16> %in, <i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32> +; CHECK-NEXT: ret <8 x i16> [[VSHL_N]] +; %mul.i = mul <8 x i16> %in, <i16 0, i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 0, i16 -1> %vshl_n = shl <8 x i16> %mul.i, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> ret <8 x i16> %vshl_n } -; CHECK-LABEL: @test_FoldShiftByConstant_CreateAnd -; CHECK: mul <16 x i8> %in0, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33> -; 
CHECK-NEXT: and <16 x i8> %vsra_n2, <i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32> -; CHECK-NEXT: ret define <16 x i8> @test_FoldShiftByConstant_CreateAnd(<16 x i8> %in0) { +; CHECK-LABEL: @test_FoldShiftByConstant_CreateAnd( +; CHECK-NEXT: [[VSRA_N2:%.*]] = mul <16 x i8> %in0, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33> +; CHECK-NEXT: [[VSHL_N:%.*]] = and <16 x i8> [[VSRA_N2]], <i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32> +; CHECK-NEXT: ret <16 x i8> [[VSHL_N]] +; %vsra_n = ashr <16 x i8> %in0, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5> %tmp = add <16 x i8> %in0, %vsra_n %vshl_n = shl <16 x i8> %tmp, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5> ret <16 x i8> %vshl_n } - define i32 @bar(i32 %x, i32 %y) { +; CHECK-LABEL: @bar( +; CHECK-NEXT: [[B1:%.*]] = shl i32 %y, 4 +; CHECK-NEXT: [[A2:%.*]] = add i32 [[B1]], %x +; CHECK-NEXT: [[C:%.*]] = and i32 [[A2]], -16 +; CHECK-NEXT: ret i32 [[C]] +; %a = lshr i32 %x, 4 %b = add i32 %a, %y %c = shl i32 %b, 4 @@ -38,16 +46,25 @@ define i32 @bar(i32 %x, i32 %y) { } define <2 x i32> @bar_v2i32(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @bar_v2i32( +; CHECK-NEXT: [[B1:%.*]] = shl <2 x i32> %y, <i32 5, i32 5> +; CHECK-NEXT: [[A2:%.*]] = add <2 x i32> [[B1]], %x +; CHECK-NEXT: [[C:%.*]] = and <2 x i32> [[A2]], <i32 -32, i32 -32> +; CHECK-NEXT: ret <2 x i32> [[C]] +; %a = lshr <2 x i32> %x, <i32 5, i32 5> %b = add <2 x i32> %a, %y %c = shl <2 x i32> %b, <i32 5, i32 5> ret <2 x i32> %c } - - - define i32 @foo(i32 %x, i32 %y) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[C1:%.*]] = shl i32 %y, 4 +; CHECK-NEXT: [[X_MASK:%.*]] = and i32 %x, 128 +; CHECK-NEXT: [[D:%.*]] = add i32 [[X_MASK]], [[C1]] +; CHECK-NEXT: ret i32 [[D]] +; %a = lshr i32 %x, 4 %b = and i32 %a, 8 %c = add i32 %b, %y @@ -56,6 +73,13 @@ define i32 @foo(i32 %x, i32 %y) { } define <2 x i32> @foo_v2i32(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @foo_v2i32( +; CHECK-NEXT: [[A:%.*]] = lshr <2 x i32> %x, <i32 4, i32 4> +; CHECK-NEXT: [[B:%.*]] = and <2 x i32> [[A]], <i32 8, i32 8> +; CHECK-NEXT: [[C:%.*]] = add <2 x i32> [[B]], %y +; CHECK-NEXT: [[D:%.*]] = shl <2 x i32> [[C]], <i32 4, i32 4> +; CHECK-NEXT: ret <2 x i32> [[D]] +; %a = lshr <2 x i32> %x, <i32 4, i32 4> %b = and <2 x i32> %a, <i32 8, i32 8> %c = add <2 x i32> %b, %y @@ -63,5 +87,3 @@ define <2 x i32> @foo_v2i32(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %d } - - diff --git a/test/Transforms/InstCombine/pr31990_wrong_memcpy.ll b/test/Transforms/InstCombine/pr31990_wrong_memcpy.ll new file mode 100644 index 000000000000..62ecd0311ffd --- /dev/null +++ b/test/Transforms/InstCombine/pr31990_wrong_memcpy.ll @@ -0,0 +1,26 @@ +; RUN: opt -S -instcombine %s -o - | FileCheck %s + +; Regression test of PR31990. A memcpy of one byte, copying 0xff, was +; replaced with a single store of an i4 0xf. 
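; (Worked example of the miscompile described above: the source byte @g is
; 0xff, so the one-byte memcpy in this test must become
;
;   store i8 -1, i8* %2    ; copies all 8 bits
;
; and not
;
;   store i4 -1, i4* %1    ; copies only the low nibble, 0xf
;
; which is exactly what the CHECK / CHECK-NOT pair at the end of the file
; verifies.)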
+ +@g = constant i8 -1 + +define void @foo() { +entry: + %0 = alloca i8 + %1 = bitcast i8* %0 to i4* + call void @bar(i4* %1) + %2 = bitcast i4* %1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %2, i8* @g, i32 1, i32 1, i1 false) + call void @gaz(i8* %2) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, + i8* nocapture readonly, i32, i32, i1) +declare void @bar(i4*) +declare void @gaz(i8*) + +; The memcpy should be simplified to a single store of an i8, not i4 +; CHECK: store i8 -1 +; CHECK-NOT: store i4 -1 diff --git a/test/Transforms/InstCombine/prefetch-load.ll b/test/Transforms/InstCombine/prefetch-load.ll new file mode 100644 index 000000000000..f98b7ae00bf1 --- /dev/null +++ b/test/Transforms/InstCombine/prefetch-load.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +%struct.C = type { %struct.C*, i32 } + +; Check that we instcombine the load across the prefetch. + +; CHECK-LABEL: define signext i32 @foo +define signext i32 @foo(%struct.C* %c) local_unnamed_addr #0 { +; CHECK: store i32 %dec, i32* %length_ +; CHECK-NOT: load +; CHECK: llvm.prefetch +; CHECK-NEXT: ret +entry: + %next_ = getelementptr inbounds %struct.C, %struct.C* %c, i32 0, i32 0 + %0 = load %struct.C*, %struct.C** %next_, align 8 + %next_1 = getelementptr inbounds %struct.C, %struct.C* %0, i32 0, i32 0 + %1 = load %struct.C*, %struct.C** %next_1, align 8 + store %struct.C* %1, %struct.C** %next_, align 8 + %length_ = getelementptr inbounds %struct.C, %struct.C* %c, i32 0, i32 1 + %2 = load i32, i32* %length_, align 8 + %dec = add nsw i32 %2, -1 + store i32 %dec, i32* %length_, align 8 + %3 = bitcast %struct.C* %1 to i8* + call void @llvm.prefetch(i8* %3, i32 0, i32 0, i32 1) + %4 = load i32, i32* %length_, align 8 + ret i32 %4 +} + +; Function Attrs: inaccessiblemem_or_argmemonly nounwind +declare void @llvm.prefetch(i8* nocapture readonly, i32, i32, i32) + +attributes #0 = { noinline nounwind } +; We've explicitly removed the function attrs from llvm.prefetch so we get the defaults. +; attributes #1 = { inaccessiblemem_or_argmemonly nounwind } diff --git a/test/Transforms/InstCombine/preserved-analyses.ll b/test/Transforms/InstCombine/preserved-analyses.ll new file mode 100644 index 000000000000..767304aecf35 --- /dev/null +++ b/test/Transforms/InstCombine/preserved-analyses.ll @@ -0,0 +1,33 @@ +; This is really testing that instcombine preserves analyses correctly, so we +; don't care much about the code other than that it is something instcombine can +; transform. 
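; (A note on how the RUN lines below test analysis preservation, since the
; pipeline syntax is dense: each pipeline first requires an analysis, then
; runs instcombine, then requires the same analysis again, e.g.
;
;   -passes='require<domtree>,instcombine,require<domtree>'
;
; If instcombine correctly reports the analysis as preserved, the second
; require<> is satisfied by the cached result, so the CHECK-NOT lines assert
; that no "Invalidating analysis:" message and no second "Running analysis:"
; message appear in the -debug-pass-manager output.)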
+; +; RUN: opt < %s -disable-output -debug-pass-manager 2>&1 -aa-pipeline=basic-aa,globals-aa \ +; RUN: -passes='require<globals-aa>,function(require<aa>,instcombine),function(require<aa>)' \ +; RUN: | FileCheck %s --check-prefix=AA +; AA: Running analysis: GlobalsAA +; AA: Running analysis: AAManager +; AA: Running analysis: BasicAA +; AA: Running pass: InstCombinePass on test +; AA-NOT: Invalidating analysis: GlobalsAA +; AA-NOT: Invalidating analysis: AAManager +; AA-NOT: Invalidating analysis: BasicAA +; AA: Running pass: RequireAnalysisPass<{{.*}}AAManager +; AA-NOT: Running analysis: GlobalsAA +; AA-NOT: Running analysis: AAManager +; AA-NOT: Running analysis: BasicAA +; +; RUN: opt < %s -disable-output -debug-pass-manager 2>&1 \ +; RUN: -passes='require<domtree>,instcombine,require<domtree>' \ +; RUN: | FileCheck %s --check-prefix=DT +; DT: Running analysis: DominatorTreeAnalysis +; DT: Running pass: InstCombinePass on test +; DT-NOT: Invalidating analysis: DominatorTreeAnalysis +; DT: Running pass: RequireAnalysisPass<{{.*}}DominatorTreeAnalysis +; DT-NOT: Running analysis: DominatorTreeAnalysis + +define i32 @test(i32 %A) { + %B = add i32 %A, 5 + %C = add i32 %B, -5 + ret i32 %C +} diff --git a/test/Transforms/InstCombine/readnone-maythrow.ll b/test/Transforms/InstCombine/readnone-maythrow.ll new file mode 100644 index 000000000000..f01e90263a30 --- /dev/null +++ b/test/Transforms/InstCombine/readnone-maythrow.ll @@ -0,0 +1,34 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +declare void @readnone_but_may_throw() readnone + +define void @f_0(i32* %ptr) { +; CHECK-LABEL: @f_0( +entry: +; CHECK: store i32 10, i32* %ptr +; CHECK-NEXT: call void @readnone_but_may_throw() +; CHECK-NEXT: store i32 20, i32* %ptr, align 4 +; CHECK: ret void + + store i32 10, i32* %ptr + call void @readnone_but_may_throw() + store i32 20, i32* %ptr + ret void +} + +define void @f_1(i1 %cond, i32* %ptr) { +; CHECK-LABEL: @f_1( +; CHECK: store i32 10, i32* %ptr +; CHECK-NEXT: call void @readnone_but_may_throw() + + store i32 10, i32* %ptr + call void @readnone_but_may_throw() + br i1 %cond, label %left, label %merge + +left: + store i32 20, i32* %ptr + br label %merge + +merge: + ret void +} diff --git a/test/Transforms/InstCombine/rem.ll b/test/Transforms/InstCombine/rem.ll index 89a741c90707..7a7a134db9c5 100644 --- a/test/Transforms/InstCombine/rem.ll +++ b/test/Transforms/InstCombine/rem.ll @@ -1,28 +1,169 @@ -; This test makes sure that rem instructions are properly eliminated. -; ; RUN: opt < %s -instcombine -S | FileCheck %s -; END. 
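; (The new tests that follow all rest on the same identity: for truncating
; integer division, x - (x / y) * y is exactly x % y. A quick sanity check
; with x = 7, y = 3: 7 / 3 = 2, then 2 * 3 = 6, and 7 - 6 = 1, which is
; 7 % 3; the same reasoning carries over to the unsigned and vector variants
; below.)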
+ +define i64 @rem_signed(i64 %x1, i64 %y2) { +; CHECK-LABEL: @rem_signed( +; CHECK-NEXT: [[R:%.*]] = srem i64 %x1, %y2 +; CHECK-NEXT: ret i64 [[R]] +; + %r = sdiv i64 %x1, %y2 + %r7 = mul i64 %r, %y2 + %r8 = sub i64 %x1, %r7 + ret i64 %r8 +} + +define <4 x i32> @rem_signed_vec(<4 x i32> %t, <4 x i32> %u) { +; CHECK-LABEL: @rem_signed_vec( +; CHECK-NEXT: [[K:%.*]] = srem <4 x i32> %t, %u +; CHECK-NEXT: ret <4 x i32> [[K]] +; + %k = sdiv <4 x i32> %t, %u + %l = mul <4 x i32> %k, %u + %m = sub <4 x i32> %t, %l + ret <4 x i32> %m +} + +define i64 @rem_unsigned(i64 %x1, i64 %y2) { +; CHECK-LABEL: @rem_unsigned( +; CHECK-NEXT: [[R:%.*]] = urem i64 %x1, %y2 +; CHECK-NEXT: ret i64 [[R]] +; + %r = udiv i64 %x1, %y2 + %r7 = mul i64 %r, %y2 + %r8 = sub i64 %x1, %r7 + ret i64 %r8 +} + +; PR28672 - https://llvm.org/bugs/show_bug.cgi?id=28672 + +define i8 @big_divisor(i8 %x) { +; CHECK-LABEL: @big_divisor( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i8 %x, -127 +; CHECK-NEXT: [[TMP2:%.*]] = add i8 %x, 127 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[TMP1]], i8 %x, i8 [[TMP2]] +; CHECK-NEXT: ret i8 [[REM]] +; + %rem = urem i8 %x, 129 + ret i8 %rem +} + +define i5 @biggest_divisor(i5 %x) { +; CHECK-LABEL: @biggest_divisor( +; CHECK-NEXT: [[NOT_:%.*]] = icmp eq i5 %x, -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[NOT_]] to i5 +; CHECK-NEXT: [[REM:%.*]] = add i5 [[TMP1]], %x +; CHECK-NEXT: ret i5 [[REM]] +; + %rem = urem i5 %x, -1 + ret i5 %rem +} + +define <2 x i4> @big_divisor_vec(<2 x i4> %x) { +; CHECK-LABEL: @big_divisor_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i4> [[X:%.*]], <i4 -3, i4 -3> +; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i4> [[X]], <i4 3, i4 3> +; CHECK-NEXT: [[REM:%.*]] = select <2 x i1> [[TMP1]], <2 x i4> [[X]], <2 x i4> [[TMP2]] +; CHECK-NEXT: ret <2 x i4> [[REM]] +; + %rem = urem <2 x i4> %x, <i4 13, i4 13> + ret <2 x i4> %rem +} + +define i8 @urem1(i8 %x, i8 %y) { +; CHECK-LABEL: @urem1( +; CHECK-NEXT: [[A:%.*]] = urem i8 %x, %y +; CHECK-NEXT: ret i8 [[A]] +; + %A = udiv i8 %x, %y + %B = mul i8 %A, %y + %C = sub i8 %x, %B + ret i8 %C +} + +define i8 @srem1(i8 %x, i8 %y) { +; CHECK-LABEL: @srem1( +; CHECK-NEXT: [[A:%.*]] = srem i8 %x, %y +; CHECK-NEXT: ret i8 [[A]] +; + %A = sdiv i8 %x, %y + %B = mul i8 %A, %y + %C = sub i8 %x, %B + ret i8 %C +} + +define i8 @urem2(i8 %x, i8 %y) { +; CHECK-LABEL: @urem2( +; CHECK-NEXT: [[A:%.*]] = urem i8 %x, %y +; CHECK-NEXT: [[C:%.*]] = sub i8 0, [[A]] +; CHECK-NEXT: ret i8 [[C]] +; + %A = udiv i8 %x, %y + %B = mul i8 %A, %y + %C = sub i8 %B, %x + ret i8 %C +} + +define i8 @urem3(i8 %x) { +; CHECK-LABEL: @urem3( +; CHECK-NEXT: [[A:%.*]] = urem i8 %x, 3 +; CHECK-NEXT: [[B1:%.*]] = sub i8 %x, [[A]] +; CHECK-NEXT: [[C:%.*]] = add i8 [[B1]], %x +; CHECK-NEXT: ret i8 [[C]] +; + %A = udiv i8 %x, 3 + %B = mul i8 %A, -3 + %C = sub i8 %x, %B + ret i8 %C +} + +; (((X / Y) * Y) / Y) -> X / Y + +define i32 @sdiv_mul_sdiv(i32 %x, i32 %y) { +; CHECK-LABEL: @sdiv_mul_sdiv( +; CHECK-NEXT: [[R:%.*]] = sdiv i32 %x, %y +; CHECK-NEXT: ret i32 [[R]] +; + %div = sdiv i32 %x, %y + %mul = mul i32 %div, %y + %r = sdiv i32 %mul, %y + ret i32 %r +} + +; (((X / Y) * Y) / Y) -> X / Y + +define i32 @udiv_mul_udiv(i32 %x, i32 %y) { +; CHECK-LABEL: @udiv_mul_udiv( +; CHECK-NEXT: [[R:%.*]] = udiv i32 %x, %y +; CHECK-NEXT: ret i32 [[R]] +; + %div = udiv i32 %x, %y + %mul = mul i32 %div, %y + %r = udiv i32 %mul, %y + ret i32 %r +} define i32 @test1(i32 %A) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: ret i32 0 - %B = srem i32 %A, 1 ; ISA constant 0 - ret i32 %B +; CHECK-NEXT: ret i32 0 +; + %B = 
srem i32 %A, 1 ; ISA constant 0 + ret i32 %B } define i32 @test2(i32 %A) { ; 0 % X = 0, we don't need to preserve traps ; CHECK-LABEL: @test2( -; CHECK-NEXT: ret i32 0 - %B = srem i32 0, %A - ret i32 %B +; CHECK-NEXT: ret i32 0 +; + %B = srem i32 0, %A + ret i32 %B } define i32 @test3(i32 %A) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[AND:%.*]] = and i32 %A, 7 -; CHECK-NEXT: ret i32 [[AND]] - %B = urem i32 %A, 8 - ret i32 %B +; CHECK-NEXT: [[B:%.*]] = and i32 %A, 7 +; CHECK-NEXT: ret i32 [[B]] +; + %B = urem i32 %A, 8 + ret i32 %B } define <2 x i32> @vec_power_of_2_constant_splat_divisor(<2 x i32> %A) { @@ -45,12 +186,13 @@ define <2 x i19> @weird_vec_power_of_2_constant_splat_divisor(<2 x i19> %A) { define i1 @test3a(i32 %A) { ; CHECK-LABEL: @test3a( -; CHECK-NEXT: [[AND:%.*]] = and i32 %A, 7 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 -; CHECK-NEXT: ret i1 [[CMP]] - %B = srem i32 %A, -8 - %C = icmp ne i32 %B, 0 - ret i1 %C +; CHECK-NEXT: [[B1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[B1]], 0 +; CHECK-NEXT: ret i1 [[C]] +; + %B = srem i32 %A, -8 + %C = icmp ne i32 %B, 0 + ret i1 %C } define <2 x i1> @test3a_vec(<2 x i32> %A) { @@ -66,201 +208,221 @@ define <2 x i1> @test3a_vec(<2 x i32> %A) { define i32 @test4(i32 %X, i1 %C) { ; CHECK-LABEL: @test4( -; CHECK-NEXT: [[SEL:%.*]] = select i1 %C, i32 0, i32 7 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[SEL]], %X - %V = select i1 %C, i32 1, i32 8 - %R = urem i32 %X, %V - ret i32 %R +; CHECK-NEXT: [[TMP1:%.*]] = select i1 %C, i32 0, i32 7 +; CHECK-NEXT: [[R:%.*]] = and i32 [[TMP1]], %X +; CHECK-NEXT: ret i32 [[R]] +; + %V = select i1 %C, i32 1, i32 8 + %R = urem i32 %X, %V + ret i32 %R } define i32 @test5(i32 %X, i8 %B) { ; CHECK-LABEL: @test5( -; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 %B to i32 -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 32, [[ZEXT]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SHL]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD]], %X -; CHECK-NEXT: ret i32 [[AND]] - %shift.upgrd.1 = zext i8 %B to i32 - %Amt = shl i32 32, %shift.upgrd.1 - %V = urem i32 %X, %Amt - ret i32 %V +; CHECK-NEXT: [[SHIFT_UPGRD_1:%.*]] = zext i8 %B to i32 +; CHECK-NEXT: [[AMT:%.*]] = shl nuw i32 32, [[SHIFT_UPGRD_1]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[AMT]], -1 +; CHECK-NEXT: [[V:%.*]] = and i32 [[TMP1]], %X +; CHECK-NEXT: ret i32 [[V]] +; + %shift.upgrd.1 = zext i8 %B to i32 + %Amt = shl i32 32, %shift.upgrd.1 + %V = urem i32 %X, %Amt + ret i32 %V } define i32 @test6(i32 %A) { ; CHECK-LABEL: @test6( -; CHECK-NEXT: ret i32 undef - %B = srem i32 %A, 0 ;; undef - ret i32 %B +; CHECK-NEXT: ret i32 undef +; + %B = srem i32 %A, 0 ;; undef + ret i32 %B } define i32 @test7(i32 %A) { ; CHECK-LABEL: @test7( -; CHECK-NEXT: ret i32 0 - %B = mul i32 %A, 8 - %C = srem i32 %B, 4 - ret i32 %C +; CHECK-NEXT: ret i32 0 +; + %B = mul i32 %A, 8 + %C = srem i32 %B, 4 + ret i32 %C } define i32 @test8(i32 %A) { ; CHECK-LABEL: @test8( -; CHECK-NEXT: ret i32 0 - %B = shl i32 %A, 4 - %C = srem i32 %B, 8 - ret i32 %C +; CHECK-NEXT: ret i32 0 +; + %B = shl i32 %A, 4 + %C = srem i32 %B, 8 + ret i32 %C } define i32 @test9(i32 %A) { ; CHECK-LABEL: @test9( -; CHECK-NEXT: ret i32 0 - %B = mul i32 %A, 64 - %C = urem i32 %B, 32 - ret i32 %C +; CHECK-NEXT: ret i32 0 +; + %B = mul i32 %A, 64 + %C = urem i32 %B, 32 + ret i32 %C } define i32 @test10(i8 %c) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: ret i32 0 - %tmp.1 = zext i8 %c to i32 - %tmp.2 = mul i32 %tmp.1, 4 - %tmp.3 = sext i32 %tmp.2 to i64 - %tmp.5 = urem i64 %tmp.3, 4 - %tmp.6 = trunc i64 %tmp.5 to i32 - ret i32 
%tmp.6 +; CHECK-NEXT: ret i32 0 +; + %tmp.1 = zext i8 %c to i32 + %tmp.2 = mul i32 %tmp.1, 4 + %tmp.3 = sext i32 %tmp.2 to i64 + %tmp.5 = urem i64 %tmp.3, 4 + %tmp.6 = trunc i64 %tmp.5 to i32 + ret i32 %tmp.6 } define i32 @test11(i32 %i) { ; CHECK-LABEL: @test11( -; CHECK-NEXT: ret i32 0 - %tmp.1 = and i32 %i, -2 - %tmp.3 = mul i32 %tmp.1, 2 - %tmp.5 = urem i32 %tmp.3, 4 - ret i32 %tmp.5 +; CHECK-NEXT: ret i32 0 +; + %tmp.1 = and i32 %i, -2 + %tmp.3 = mul i32 %tmp.1, 2 + %tmp.5 = urem i32 %tmp.3, 4 + ret i32 %tmp.5 } define i32 @test12(i32 %i) { ; CHECK-LABEL: @test12( -; CHECK-NEXT: ret i32 0 - %tmp.1 = and i32 %i, -4 - %tmp.5 = srem i32 %tmp.1, 2 - ret i32 %tmp.5 +; CHECK-NEXT: ret i32 0 +; + %tmp.1 = and i32 %i, -4 + %tmp.5 = srem i32 %tmp.1, 2 + ret i32 %tmp.5 } define i32 @test13(i32 %i) { ; CHECK-LABEL: @test13( -; CHECK-NEXT: ret i32 0 - %x = srem i32 %i, %i - ret i32 %x +; CHECK-NEXT: ret i32 0 +; + %x = srem i32 %i, %i + ret i32 %x } define i64 @test14(i64 %x, i32 %y) { ; CHECK-LABEL: @test14( -; CHECK-NEXT: [[SHL:%.*]] = shl i32 1, %y -; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SHL]] to i64 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[ZEXT]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i64 [[ADD]], %x -; CHECK-NEXT: ret i64 [[AND]] - %shl = shl i32 1, %y - %zext = zext i32 %shl to i64 - %urem = urem i64 %x, %zext - ret i64 %urem +; CHECK-NEXT: [[SHL:%.*]] = shl i32 1, %y +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SHL]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[ZEXT]], -1 +; CHECK-NEXT: [[UREM:%.*]] = and i64 [[TMP1]], %x +; CHECK-NEXT: ret i64 [[UREM]] +; + %shl = shl i32 1, %y + %zext = zext i32 %shl to i64 + %urem = urem i64 %x, %zext + ret i64 %urem } define i64 @test15(i32 %x, i32 %y) { ; CHECK-LABEL: @test15( -; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, %y -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SHL]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD]], %x -; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[AND]] to i64 -; CHECK-NEXT: ret i64 [[ZEXT]] - %shl = shl i32 1, %y - %zext0 = zext i32 %shl to i64 - %zext1 = zext i32 %x to i64 - %urem = urem i64 %zext1, %zext0 - ret i64 %urem +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, %y +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SHL]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], %x +; CHECK-NEXT: [[UREM:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: ret i64 [[UREM]] +; + %shl = shl i32 1, %y + %zext0 = zext i32 %shl to i64 + %zext1 = zext i32 %x to i64 + %urem = urem i64 %zext1, %zext0 + ret i64 %urem } define i32 @test16(i32 %x, i32 %y) { ; CHECK-LABEL: @test16( -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 %y, 11 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHR]], 4 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[AND]], 3 -; CHECK-NEXT: [[REM:%.*]] = and i32 [[OR]], %x -; CHECK-NEXT: ret i32 [[REM]] - %shr = lshr i32 %y, 11 - %and = and i32 %shr, 4 - %add = add i32 %and, 4 - %rem = urem i32 %x, %add - ret i32 %rem +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 %y, 11 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHR]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[AND]], 3 +; CHECK-NEXT: [[REM:%.*]] = and i32 [[TMP1]], %x +; CHECK-NEXT: ret i32 [[REM]] +; + %shr = lshr i32 %y, 11 + %and = and i32 %shr, 4 + %add = add i32 %and, 4 + %rem = urem i32 %x, %add + ret i32 %rem } define i32 @test17(i32 %X) { ; CHECK-LABEL: @test17( -; CHECK-NEXT: icmp ne i32 %X, 1 -; CHECK-NEXT: zext i1 -; CHECK-NEXT: ret +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 %X, 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[TMP2]] +; %A = urem i32 1, %X ret i32 %A } define i32 @test18(i16 %x, 
i32 %y) { -; CHECK: @test18 -; CHECK-NEXT: [[SHL:%.*]] = shl i16 %x, 3 -; CHECK-NEXT: [[AND:%.*]] = and i16 [[SHL]], 32 -; CHECK-NEXT: [[XOR:%.*]] = xor i16 [[AND]], 63 -; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[XOR]] to i32 -; CHECK-NEXT: [[REM:%.*]] = and i32 [[EXT]], %y -; CHECK-NEXT: ret i32 [[REM]] - %1 = and i16 %x, 4 - %2 = icmp ne i16 %1, 0 - %3 = select i1 %2, i32 32, i32 64 - %4 = urem i32 %y, %3 - ret i32 %4 +; CHECK-LABEL: @test18( +; CHECK-NEXT: [[TMP1:%.*]] = shl i16 %x, 3 +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 32 +; CHECK-NEXT: [[TMP3:%.*]] = xor i16 [[TMP2]], 63 +; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], %y +; CHECK-NEXT: ret i32 [[TMP5]] +; + %1 = and i16 %x, 4 + %2 = icmp ne i16 %1, 0 + %3 = select i1 %2, i32 32, i32 64 + %4 = urem i32 %y, %3 + ret i32 %4 } define i32 @test19(i32 %x, i32 %y) { -; CHECK: @test19 -; CHECK-NEXT: [[SHL1:%.*]] = shl i32 1, %x -; CHECK-NEXT: [[SHL2:%.*]] = shl i32 1, %y -; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHL1]], [[SHL2]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[AND]], [[SHL1]] -; CHECK-NEXT: [[SUB:%.*]] = add i32 [[ADD]], -1 -; CHECK-NEXT: [[REM:%.*]] = and i32 [[SUB]], %y -; CHECK-NEXT: ret i32 [[REM]] - %A = shl i32 1, %x - %B = shl i32 1, %y - %C = and i32 %A, %B - %D = add i32 %C, %A - %E = urem i32 %y, %D - ret i32 %E +; CHECK-LABEL: @test19( +; CHECK-NEXT: [[A:%.*]] = shl i32 1, %x +; CHECK-NEXT: [[B:%.*]] = shl i32 1, %y +; CHECK-NEXT: [[C:%.*]] = and i32 [[A]], [[B]] +; CHECK-NEXT: [[D:%.*]] = add i32 [[C]], [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[D]], -1 +; CHECK-NEXT: [[E:%.*]] = and i32 [[TMP1]], %y +; CHECK-NEXT: ret i32 [[E]] +; + %A = shl i32 1, %x + %B = shl i32 1, %y + %C = and i32 %A, %B + %D = add i32 %C, %A + %E = urem i32 %y, %D + ret i32 %E } define <2 x i64> @test20(<2 x i64> %X, <2 x i1> %C) { ; CHECK-LABEL: @test20( -; CHECK-NEXT: select <2 x i1> %C, <2 x i64> <i64 1, i64 2>, <2 x i64> zeroinitializer -; CHECK-NEXT: ret <2 x i64> - %V = select <2 x i1> %C, <2 x i64> <i64 1, i64 2>, <2 x i64> <i64 8, i64 9> - %R = urem <2 x i64> %V, <i64 2, i64 3> - ret <2 x i64> %R +; CHECK-NEXT: [[R:%.*]] = select <2 x i1> %C, <2 x i64> <i64 1, i64 2>, <2 x i64> zeroinitializer +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %V = select <2 x i1> %C, <2 x i64> <i64 1, i64 2>, <2 x i64> <i64 8, i64 9> + %R = urem <2 x i64> %V, <i64 2, i64 3> + ret <2 x i64> %R } -define i32 @test21(i1 %c0, i32* %val) { +define i32 @test21(i1 %c0, i32* %p) { ; CHECK-LABEL: @test21( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 %c0, label %if.then, label %if.end +; CHECK: if.then: +; CHECK-NEXT: [[V:%.*]] = load volatile i32, i32* %p, align 4 +; CHECK-NEXT: [[PHITMP:%.*]] = srem i32 [[V]], 5 +; CHECK-NEXT: br label %if.end +; CHECK: if.end: +; CHECK-NEXT: [[LHS:%.*]] = phi i32 [ [[PHITMP]], %if.then ], [ 0, %entry ] +; CHECK-NEXT: ret i32 [[LHS]] +; entry: br i1 %c0, label %if.then, label %if.end if.then: -; CHECK: if.then: -; CHECK-NEXT: %v = load volatile i32, i32* %val, align 4 -; CHECK-NEXT: %phitmp = srem i32 %v, 5 - - %v = load volatile i32, i32* %val + %v = load volatile i32, i32* %p br label %if.end if.end: -; CHECK: if.end: -; CHECK-NEXT: %lhs = phi i32 [ %phitmp, %if.then ], [ 0, %entry ] -; CHECK-NEXT: ret i32 %lhs - %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ] %rem = srem i32 %lhs, 5 ret i32 %rem @@ -269,28 +431,34 @@ if.end: @a = common global [5 x i16] zeroinitializer, align 2 @b = common global i16 0, align 2 -define i32 @pr27968_0(i1 %c0, i32* %val) { +define i32 @pr27968_0(i1 
%c0, i32* %p) { ; CHECK-LABEL: @pr27968_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 %c0, label %if.then, label %if.end +; CHECK: if.then: +; CHECK-NEXT: [[V:%.*]] = load volatile i32, i32* %p, align 4 +; CHECK-NEXT: br label %if.end +; CHECK: if.end: +; CHECK-NEXT: [[LHS:%.*]] = phi i32 [ [[V]], %if.then ], [ 5, %entry ] +; CHECK-NEXT: br i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b), label [[REM]].is.safe, label [[REM]].is.unsafe +; CHECK: rem.is.safe: +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[LHS]], zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32) +; CHECK-NEXT: ret i32 [[REM]] +; CHECK: rem.is.unsafe: +; CHECK-NEXT: ret i32 0 +; entry: br i1 %c0, label %if.then, label %if.end if.then: - %v = load volatile i32, i32* %val + %v = load volatile i32, i32* %p br label %if.end -; CHECK: if.then: -; CHECK-NOT: srem -; CHECK: br label %if.end - if.end: %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ] br i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b), label %rem.is.safe, label %rem.is.unsafe rem.is.safe: -; CHECK: rem.is.safe: -; CHECK-NEXT: %rem = srem i32 %lhs, zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32) -; CHECK-NEXT: ret i32 %rem - %rem = srem i32 %lhs, zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32) ret i32 %rem @@ -298,19 +466,29 @@ rem.is.unsafe: ret i32 0 } -define i32 @pr27968_1(i1 %c0, i1 %always_false, i32* %val) { +define i32 @pr27968_1(i1 %c0, i1 %always_false, i32* %p) { ; CHECK-LABEL: @pr27968_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 %c0, label %if.then, label %if.end +; CHECK: if.then: +; CHECK-NEXT: [[V:%.*]] = load volatile i32, i32* %p, align 4 +; CHECK-NEXT: br label %if.end +; CHECK: if.end: +; CHECK-NEXT: [[LHS:%.*]] = phi i32 [ [[V]], %if.then ], [ 5, %entry ] +; CHECK-NEXT: br i1 %always_false, label [[REM]].is.safe, label [[REM]].is.unsafe +; CHECK: rem.is.safe: +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[LHS]], -2147483648 +; CHECK-NEXT: ret i32 [[REM]] +; CHECK: rem.is.unsafe: +; CHECK-NEXT: ret i32 0 +; entry: br i1 %c0, label %if.then, label %if.end if.then: - %v = load volatile i32, i32* %val + %v = load volatile i32, i32* %p br label %if.end -; CHECK: if.then: -; CHECK-NOT: srem -; CHECK: br label %if.end - if.end: %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ] br i1 %always_false, label %rem.is.safe, label %rem.is.unsafe @@ -319,36 +497,38 @@ rem.is.safe: %rem = srem i32 %lhs, -2147483648 ret i32 %rem -; CHECK: rem.is.safe: -; CHECK-NEXT: %rem = srem i32 %lhs, -2147483648 -; CHECK-NEXT: ret i32 %rem - rem.is.unsafe: ret i32 0 } -define i32 @pr27968_2(i1 %c0, i32* %val) { +define i32 @pr27968_2(i1 %c0, i32* %p) { ; CHECK-LABEL: @pr27968_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 %c0, label %if.then, label %if.end +; CHECK: if.then: +; CHECK-NEXT: [[V:%.*]] = load volatile i32, i32* %p, align 4 +; CHECK-NEXT: br label %if.end +; CHECK: if.end: +; CHECK-NEXT: [[LHS:%.*]] = phi i32 [ [[V]], %if.then ], [ 5, %entry ] +; CHECK-NEXT: br i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b), label [[REM]].is.safe, label [[REM]].is.unsafe +; CHECK: rem.is.safe: +; CHECK-NEXT: [[REM:%.*]] = urem i32 [[LHS]], zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32) +; CHECK-NEXT: ret i32 [[REM]] +; CHECK: rem.is.unsafe: +; 
CHECK-NEXT: ret i32 0 +; entry: br i1 %c0, label %if.then, label %if.end if.then: - %v = load volatile i32, i32* %val + %v = load volatile i32, i32* %p br label %if.end -; CHECK: if.then: -; CHECK-NOT: urem -; CHECK: br label %if.end - if.end: %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ] br i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b), label %rem.is.safe, label %rem.is.unsafe rem.is.safe: -; CHECK: rem.is.safe: -; CHECK-NEXT: %rem = urem i32 %lhs, zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32) -; CHECK-NEXT: ret i32 %rem - %rem = urem i32 %lhs, zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32) ret i32 %rem @@ -356,20 +536,29 @@ rem.is.unsafe: ret i32 0 } -define i32 @pr27968_3(i1 %c0, i1 %always_false, i32* %val) { +define i32 @pr27968_3(i1 %c0, i1 %always_false, i32* %p) { ; CHECK-LABEL: @pr27968_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 %c0, label %if.then, label %if.end +; CHECK: if.then: +; CHECK-NEXT: [[V:%.*]] = load volatile i32, i32* %p, align 4 +; CHECK-NEXT: [[PHITMP:%.*]] = and i32 [[V]], 2147483647 +; CHECK-NEXT: br label %if.end +; CHECK: if.end: +; CHECK-NEXT: [[LHS:%.*]] = phi i32 [ [[PHITMP]], %if.then ], [ 5, %entry ] +; CHECK-NEXT: br i1 %always_false, label %rem.is.safe, label %rem.is.unsafe +; CHECK: rem.is.safe: +; CHECK-NEXT: ret i32 [[LHS]] +; CHECK: rem.is.unsafe: +; CHECK-NEXT: ret i32 0 +; entry: br i1 %c0, label %if.then, label %if.end if.then: - %v = load volatile i32, i32* %val + %v = load volatile i32, i32* %p br label %if.end -; CHECK: if.then: -; CHECK-NEXT: %v = load volatile i32, i32* %val, align 4 -; CHECK-NEXT: %phitmp = and i32 %v, 2147483647 -; CHECK-NEXT: br label %if.end - if.end: %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ] br i1 %always_false, label %rem.is.safe, label %rem.is.unsafe @@ -381,3 +570,4 @@ rem.is.safe: rem.is.unsafe: ret i32 0 } + diff --git a/test/Transforms/InstCombine/select-bitext.ll b/test/Transforms/InstCombine/select-bitext.ll index 6e374f5221d1..b66a9eef4ab6 100644 --- a/test/Transforms/InstCombine/select-bitext.ll +++ b/test/Transforms/InstCombine/select-bitext.ll @@ -100,7 +100,7 @@ define <2 x i64> @trunc_sel_larger_sext_vec(<2 x i32> %a, <2 x i1> %cmp) { ; CHECK-LABEL: @trunc_sel_larger_sext_vec( ; CHECK-NEXT: [[TRUNC:%.*]] = zext <2 x i32> %a to <2 x i64> ; CHECK-NEXT: [[SEXT:%.*]] = shl <2 x i64> [[TRUNC]], <i64 48, i64 48> -; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> [[SEXT]], <i64 48, i64 48> +; CHECK-NEXT: [[TMP1:%.*]] = ashr exact <2 x i64> [[SEXT]], <i64 48, i64 48> ; CHECK-NEXT: [[EXT:%.*]] = select <2 x i1> %cmp, <2 x i64> [[TMP1]], <2 x i64> <i64 42, i64 43> ; CHECK-NEXT: ret <2 x i64> [[EXT]] ; @@ -127,7 +127,7 @@ define <2 x i32> @trunc_sel_smaller_sext_vec(<2 x i64> %a, <2 x i1> %cmp) { ; CHECK-LABEL: @trunc_sel_smaller_sext_vec( ; CHECK-NEXT: [[TRUNC:%.*]] = trunc <2 x i64> %a to <2 x i32> ; CHECK-NEXT: [[SEXT:%.*]] = shl <2 x i32> [[TRUNC]], <i32 16, i32 16> -; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[SEXT]], <i32 16, i32 16> +; CHECK-NEXT: [[TMP1:%.*]] = ashr exact <2 x i32> [[SEXT]], <i32 16, i32 16> ; CHECK-NEXT: [[EXT:%.*]] = select <2 x i1> %cmp, <2 x i32> [[TMP1]], <2 x i32> <i32 42, i32 43> ; CHECK-NEXT: ret <2 x i32> [[EXT]] ; @@ -153,7 +153,7 @@ define i32 @trunc_sel_equal_sext(i32 %a, i1 %cmp) { define <2 x i32> @trunc_sel_equal_sext_vec(<2 x i32> %a, <2 x i1> %cmp) { ; CHECK-LABEL: @trunc_sel_equal_sext_vec( ; CHECK-NEXT: 
[[SEXT:%.*]] = shl <2 x i32> %a, <i32 16, i32 16> -; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[SEXT]], <i32 16, i32 16> +; CHECK-NEXT: [[TMP1:%.*]] = ashr exact <2 x i32> [[SEXT]], <i32 16, i32 16> ; CHECK-NEXT: [[EXT:%.*]] = select <2 x i1> %cmp, <2 x i32> [[TMP1]], <2 x i32> <i32 42, i32 43> ; CHECK-NEXT: ret <2 x i32> [[EXT]] ; diff --git a/test/Transforms/InstCombine/select-cmp-br.ll b/test/Transforms/InstCombine/select-cmp-br.ll index 1dc7e153f5fb..59384ab7b1f0 100644 --- a/test/Transforms/InstCombine/select-cmp-br.ll +++ b/test/Transforms/InstCombine/select-cmp-br.ll @@ -1,155 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; Replace a 'select' with 'or' in 'select - cmp [eq|ne] - br' sequence ; RUN: opt -instcombine -S < %s | FileCheck %s -%C = type <{ %struct.S }> %struct.S = type { i64*, i32, i32 } +%C = type <{ %struct.S }> -declare void @bar(%struct.S *) #1 +declare void @bar(%struct.S*) declare void @foobar() -define void @test1(%C*) { +define void @test1(%C* %arg) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG:%.*]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[M:%.*]] = load i64*, i64** [[TMP]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 1, i32 0, i32 0 +; CHECK-NEXT: [[N:%.*]] = load i64*, i64** [[TMP1]], align 8 +; CHECK-NEXT: [[NOT_TMP5:%.*]] = icmp ne i64* [[M]], [[N]] +; CHECK-NEXT: [[TMP71:%.*]] = icmp eq %C* [[ARG]], null +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP71]], [[NOT_TMP5]] +; CHECK-NEXT: br i1 [[TMP7]], label [[BB10:%.*]], label [[BB8:%.*]] +; CHECK: bb: +; CHECK-NEXT: ret void +; CHECK: bb8: +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 0, i32 0 +; CHECK-NEXT: tail call void @bar(%struct.S* [[TMP9]]) +; CHECK-NEXT: br label [[BB:%.*]] +; CHECK: bb10: +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[M]], i64 9 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to i64 (%C*)** +; CHECK-NEXT: [[TMP4:%.*]] = load i64 (%C*)*, i64 (%C*)** [[TMP3]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 [[TMP4]](%C* [[ARG]]) +; CHECK-NEXT: br label [[BB]] +; entry: - %1 = getelementptr inbounds %C, %C* %0, i64 0, i32 0, i32 0 - %m = load i64*, i64** %1, align 8 - %2 = getelementptr inbounds %C, %C* %0, i64 1, i32 0, i32 0 - %n = load i64*, i64** %2, align 8 - %3 = getelementptr inbounds i64, i64* %m, i64 9 - %4 = bitcast i64* %3 to i64 (%C*)** - %5 = load i64 (%C*)*, i64 (%C*)** %4, align 8 - %6 = icmp eq i64* %m, %n - %7 = select i1 %6, %C* %0, %C* null - %8 = icmp eq %C* %7, null - br i1 %8, label %12, label %10 - -; <label>:9 ; preds = %10, %12 + %tmp = getelementptr inbounds %C, %C* %arg, i64 0, i32 0, i32 0 + %m = load i64*, i64** %tmp, align 8 + %tmp1 = getelementptr inbounds %C, %C* %arg, i64 1, i32 0, i32 0 + %n = load i64*, i64** %tmp1, align 8 + %tmp2 = getelementptr inbounds i64, i64* %m, i64 9 + %tmp3 = bitcast i64* %tmp2 to i64 (%C*)** + %tmp4 = load i64 (%C*)*, i64 (%C*)** %tmp3, align 8 + %tmp5 = icmp eq i64* %m, %n + %tmp6 = select i1 %tmp5, %C* %arg, %C* null + %tmp7 = icmp eq %C* %tmp6, null + br i1 %tmp7, label %bb10, label %bb8 + +bb: ; preds = %bb10, %bb8 ret void -; <label>:10 ; preds = %entry - %11 = getelementptr inbounds %C, %C* %7, i64 0, i32 0 - tail call void @bar(%struct.S* %11) - br label %9 +bb8: ; preds = %entry + %tmp9 = getelementptr inbounds %C, %C* %tmp6, i64 0, i32 0 + tail call void @bar(%struct.S* %tmp9) + br label %bb -; <label>:12 ; preds 
= %entry - %13 = tail call i64 %5(%C* %0) - br label %9 -; CHECK-LABEL: @test1( -; CHECK-NOT: select -; CHECK: or -; CHECK-NOT: select +bb10: ; preds = %entry + %tmp11 = tail call i64 %tmp4(%C* %arg) + br label %bb } -define void @test2(%C*) { +define void @test2(%C* %arg) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG:%.*]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[M:%.*]] = load i64*, i64** [[TMP]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 1, i32 0, i32 0 +; CHECK-NEXT: [[N:%.*]] = load i64*, i64** [[TMP1]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64* [[M]], [[N]] +; CHECK-NEXT: [[TMP71:%.*]] = icmp eq %C* [[ARG]], null +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP71]] +; CHECK-NEXT: br i1 [[TMP7]], label [[BB10:%.*]], label [[BB8:%.*]] +; CHECK: bb: +; CHECK-NEXT: ret void +; CHECK: bb8: +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 0, i32 0 +; CHECK-NEXT: tail call void @bar(%struct.S* [[TMP9]]) +; CHECK-NEXT: br label [[BB:%.*]] +; CHECK: bb10: +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[M]], i64 9 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to i64 (%C*)** +; CHECK-NEXT: [[TMP4:%.*]] = load i64 (%C*)*, i64 (%C*)** [[TMP3]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 [[TMP4]](%C* [[ARG]]) +; CHECK-NEXT: br label [[BB]] +; entry: - %1 = getelementptr inbounds %C, %C* %0, i64 0, i32 0, i32 0 - %m = load i64*, i64** %1, align 8 - %2 = getelementptr inbounds %C, %C* %0, i64 1, i32 0, i32 0 - %n = load i64*, i64** %2, align 8 - %3 = getelementptr inbounds i64, i64* %m, i64 9 - %4 = bitcast i64* %3 to i64 (%C*)** - %5 = load i64 (%C*)*, i64 (%C*)** %4, align 8 - %6 = icmp eq i64* %m, %n - %7 = select i1 %6, %C* null, %C* %0 - %8 = icmp eq %C* %7, null - br i1 %8, label %12, label %10 - -; <label>:9 ; preds = %10, %12 + %tmp = getelementptr inbounds %C, %C* %arg, i64 0, i32 0, i32 0 + %m = load i64*, i64** %tmp, align 8 + %tmp1 = getelementptr inbounds %C, %C* %arg, i64 1, i32 0, i32 0 + %n = load i64*, i64** %tmp1, align 8 + %tmp2 = getelementptr inbounds i64, i64* %m, i64 9 + %tmp3 = bitcast i64* %tmp2 to i64 (%C*)** + %tmp4 = load i64 (%C*)*, i64 (%C*)** %tmp3, align 8 + %tmp5 = icmp eq i64* %m, %n + %tmp6 = select i1 %tmp5, %C* null, %C* %arg + %tmp7 = icmp eq %C* %tmp6, null + br i1 %tmp7, label %bb10, label %bb8 + +bb: ; preds = %bb10, %bb8 ret void -; <label>:10 ; preds = %entry - %11 = getelementptr inbounds %C, %C* %7, i64 0, i32 0 - tail call void @bar(%struct.S* %11) - br label %9 +bb8: ; preds = %entry + %tmp9 = getelementptr inbounds %C, %C* %tmp6, i64 0, i32 0 + tail call void @bar(%struct.S* %tmp9) + br label %bb -; <label>:12 ; preds = %entry - %13 = tail call i64 %5(%C* %0) - br label %9 -; CHECK-LABEL: @test2( -; CHECK-NOT: select -; CHECK: or -; CHECK-NOT: select +bb10: ; preds = %entry + %tmp11 = tail call i64 %tmp4(%C* %arg) + br label %bb } -define void @test3(%C*) { +define void @test3(%C* %arg) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG:%.*]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[M:%.*]] = load i64*, i64** [[TMP]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 1, i32 0, i32 0 +; CHECK-NEXT: [[N:%.*]] = load i64*, i64** [[TMP1]], align 8 +; CHECK-NEXT: [[NOT_TMP5:%.*]] = icmp ne i64* [[M]], [[N]] +; CHECK-NEXT: [[TMP71:%.*]] = icmp eq %C* [[ARG]], null +; 
CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP71]], [[NOT_TMP5]] +; CHECK-NEXT: br i1 [[TMP7]], label [[BB10:%.*]], label [[BB8:%.*]] +; CHECK: bb: +; CHECK-NEXT: ret void +; CHECK: bb8: +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 0, i32 0 +; CHECK-NEXT: tail call void @bar(%struct.S* [[TMP9]]) +; CHECK-NEXT: br label [[BB:%.*]] +; CHECK: bb10: +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[M]], i64 9 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to i64 (%C*)** +; CHECK-NEXT: [[TMP4:%.*]] = load i64 (%C*)*, i64 (%C*)** [[TMP3]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 [[TMP4]](%C* [[ARG]]) +; CHECK-NEXT: br label [[BB]] +; entry: - %1 = getelementptr inbounds %C, %C* %0, i64 0, i32 0, i32 0 - %m = load i64*, i64** %1, align 8 - %2 = getelementptr inbounds %C, %C* %0, i64 1, i32 0, i32 0 - %n = load i64*, i64** %2, align 8 - %3 = getelementptr inbounds i64, i64* %m, i64 9 - %4 = bitcast i64* %3 to i64 (%C*)** - %5 = load i64 (%C*)*, i64 (%C*)** %4, align 8 - %6 = icmp eq i64* %m, %n - %7 = select i1 %6, %C* %0, %C* null - %8 = icmp ne %C* %7, null - br i1 %8, label %10, label %12 - -; <label>:9 ; preds = %10, %12 + %tmp = getelementptr inbounds %C, %C* %arg, i64 0, i32 0, i32 0 + %m = load i64*, i64** %tmp, align 8 + %tmp1 = getelementptr inbounds %C, %C* %arg, i64 1, i32 0, i32 0 + %n = load i64*, i64** %tmp1, align 8 + %tmp2 = getelementptr inbounds i64, i64* %m, i64 9 + %tmp3 = bitcast i64* %tmp2 to i64 (%C*)** + %tmp4 = load i64 (%C*)*, i64 (%C*)** %tmp3, align 8 + %tmp5 = icmp eq i64* %m, %n + %tmp6 = select i1 %tmp5, %C* %arg, %C* null + %tmp7 = icmp ne %C* %tmp6, null + br i1 %tmp7, label %bb8, label %bb10 + +bb: ; preds = %bb10, %bb8 ret void -; <label>:10 ; preds = %entry - %11 = getelementptr inbounds %C, %C* %7, i64 0, i32 0 - tail call void @bar(%struct.S* %11) - br label %9 +bb8: ; preds = %entry + %tmp9 = getelementptr inbounds %C, %C* %tmp6, i64 0, i32 0 + tail call void @bar(%struct.S* %tmp9) + br label %bb -; <label>:12 ; preds = %entry - %13 = tail call i64 %5(%C* %0) - br label %9 -; CHECK-LABEL: @test3( -; CHECK-NOT: select -; CHECK: or -; CHECK-NOT: select +bb10: ; preds = %entry + %tmp11 = tail call i64 %tmp4(%C* %arg) + br label %bb } -define void @test4(%C*) { +define void @test4(%C* %arg) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG:%.*]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[M:%.*]] = load i64*, i64** [[TMP]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 1, i32 0, i32 0 +; CHECK-NEXT: [[N:%.*]] = load i64*, i64** [[TMP1]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64* [[M]], [[N]] +; CHECK-NEXT: [[TMP71:%.*]] = icmp eq %C* [[ARG]], null +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP71]] +; CHECK-NEXT: br i1 [[TMP7]], label [[BB10:%.*]], label [[BB8:%.*]] +; CHECK: bb: +; CHECK-NEXT: ret void +; CHECK: bb8: +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 0, i32 0 +; CHECK-NEXT: tail call void @bar(%struct.S* [[TMP9]]) +; CHECK-NEXT: br label [[BB:%.*]] +; CHECK: bb10: +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[M]], i64 9 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to i64 (%C*)** +; CHECK-NEXT: [[TMP4:%.*]] = load i64 (%C*)*, i64 (%C*)** [[TMP3]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 [[TMP4]](%C* [[ARG]]) +; CHECK-NEXT: br label [[BB]] +; entry: - %1 = getelementptr inbounds %C, %C* %0, i64 0, i32 0, i32 0 - 
%m = load i64*, i64** %1, align 8 - %2 = getelementptr inbounds %C, %C* %0, i64 1, i32 0, i32 0 - %n = load i64*, i64** %2, align 8 - %3 = getelementptr inbounds i64, i64* %m, i64 9 - %4 = bitcast i64* %3 to i64 (%C*)** - %5 = load i64 (%C*)*, i64 (%C*)** %4, align 8 - %6 = icmp eq i64* %m, %n - %7 = select i1 %6, %C* null, %C* %0 - %8 = icmp ne %C* %7, null - br i1 %8, label %10, label %12 - -; <label>:9 ; preds = %10, %12 + %tmp = getelementptr inbounds %C, %C* %arg, i64 0, i32 0, i32 0 + %m = load i64*, i64** %tmp, align 8 + %tmp1 = getelementptr inbounds %C, %C* %arg, i64 1, i32 0, i32 0 + %n = load i64*, i64** %tmp1, align 8 + %tmp2 = getelementptr inbounds i64, i64* %m, i64 9 + %tmp3 = bitcast i64* %tmp2 to i64 (%C*)** + %tmp4 = load i64 (%C*)*, i64 (%C*)** %tmp3, align 8 + %tmp5 = icmp eq i64* %m, %n + %tmp6 = select i1 %tmp5, %C* null, %C* %arg + %tmp7 = icmp ne %C* %tmp6, null + br i1 %tmp7, label %bb8, label %bb10 + +bb: ; preds = %bb10, %bb8 ret void -; <label>:10 ; preds = %entry - %11 = getelementptr inbounds %C, %C* %7, i64 0, i32 0 - tail call void @bar(%struct.S* %11) - br label %9 +bb8: ; preds = %entry + %tmp9 = getelementptr inbounds %C, %C* %tmp6, i64 0, i32 0 + tail call void @bar(%struct.S* %tmp9) + br label %bb -; <label>:12 ; preds = %entry - %13 = tail call i64 %5(%C* %0) - br label %9 -; CHECK-LABEL: @test4( -; CHECK-NOT: select -; CHECK: or -; CHECK-NOT: select +bb10: ; preds = %entry + %tmp11 = tail call i64 %tmp4(%C* %arg) + br label %bb } -define void @test5(%C*, i1) { +define void @test5(%C* %arg, i1 %arg1) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq %C* [[ARG:%.*]], null +; CHECK-NEXT: [[TMP2:%.*]] = or i1 [[TMP21]], [[ARG1:%.*]] +; CHECK-NEXT: br i1 [[TMP2]], label [[BB5:%.*]], label [[BB3:%.*]] +; CHECK: bb: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG]], i64 0, i32 0 +; CHECK-NEXT: tail call void @bar(%struct.S* [[TMP4]]) +; CHECK-NEXT: br label [[BB:%.*]] +; CHECK: bb5: +; CHECK-NEXT: tail call void @foobar() +; CHECK-NEXT: br label [[BB]] +; entry: - %2 = select i1 %1, %C* null, %C* %0 - %3 = icmp ne %C* %2, null - br i1 %3, label %5, label %7 + %tmp = select i1 %arg1, %C* null, %C* %arg + %tmp2 = icmp ne %C* %tmp, null + br i1 %tmp2, label %bb3, label %bb5 -; <label>:4 ; preds = %10, %12 +bb: ; preds = %bb5, %bb3 ret void -; <label>:5 ; preds = %entry - %6 = getelementptr inbounds %C, %C* %2, i64 0, i32 0 - tail call void @bar(%struct.S* %6) - br label %4 +bb3: ; preds = %entry + %tmp4 = getelementptr inbounds %C, %C* %tmp, i64 0, i32 0 + tail call void @bar(%struct.S* %tmp4) + br label %bb -; <label>:7 ; preds = %entry +bb5: ; preds = %entry tail call void @foobar() - br label %4 -; CHECK-LABEL: @test5( -; CHECK-NOT: select -; CHECK: or -; CHECK-NOT: select + br label %bb +} + +; Negative test. Must not trigger the select-cmp-br combine because the result +; of the select is used in both flows following the br (the special case where +; the conditional branch has the same target for both flows). 
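; (For contrast, a minimal sketch of the positive form that tests 1-5 above
; exercise; value names here are illustrative, not from the patch. In the
; simplest case, when the select result feeds only the null-compare,
;
;   %sel    = select i1 %c, i32* null, i32* %p
;   %isnull = icmp eq i32* %sel, null
;
; the pair reduces to a single 'or':
;
;   %pnull  = icmp eq i32* %p, null
;   %isnull = or i1 %c, %pnull
;
; since the select is null exactly when %c is true or %p is null. In @test6
; below the select result is also returned, so no such reduction is possible.)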
+define i32 @test6(i32 %arg, i1 %arg1) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[BB:%.*]], label [[BB]] +; CHECK: bb: +; CHECK-NEXT: [[TMP:%.*]] = select i1 [[ARG1:%.*]], i32 [[ARG:%.*]], i32 0 +; CHECK-NEXT: ret i32 [[TMP]] +; +entry: + %tmp = select i1 %arg1, i32 %arg, i32 0 + %tmp2 = icmp eq i32 %tmp, 0 + br i1 %tmp2, label %bb, label %bb + +bb: ; preds = %entry, %entry + ret i32 %tmp } diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll index f8c96e7f3f67..c26380eaa71b 100644 --- a/test/Transforms/InstCombine/select.ll +++ b/test/Transforms/InstCombine/select.ll @@ -190,7 +190,7 @@ define <2 x i1> @test62vec(<2 x i1> %A, <2 x i1> %B) { define i1 @test63(i1 %A, i1 %B) { ; CHECK-LABEL: @test63( ; CHECK-NEXT: [[NOT:%.*]] = xor i1 %A, true -; CHECK-NEXT: [[C:%.*]] = or i1 %B, [[NOT]] +; CHECK-NEXT: [[C:%.*]] = or i1 [[NOT]], %B ; CHECK-NEXT: ret i1 [[C]] ; %not = xor i1 %A, true @@ -201,7 +201,7 @@ define i1 @test63(i1 %A, i1 %B) { define <2 x i1> @test63vec(<2 x i1> %A, <2 x i1> %B) { ; CHECK-LABEL: @test63vec( ; CHECK-NEXT: [[NOT:%.*]] = xor <2 x i1> %A, <i1 true, i1 true> -; CHECK-NEXT: [[C:%.*]] = or <2 x i1> %B, [[NOT]] +; CHECK-NEXT: [[C:%.*]] = or <2 x i1> [[NOT]], %B ; CHECK-NEXT: ret <2 x i1> [[C]] ; %not = xor <2 x i1> %A, <i1 true, i1 true> @@ -1264,11 +1264,10 @@ define i32 @PR23757(i32 %x) { define i32 @PR27137(i32 %a) { ; CHECK-LABEL: @PR27137( ; CHECK-NEXT: [[NOT_A:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 %a, 0 +; CHECK-NEXT: [[C0:%.*]] = icmp sgt i32 [[NOT_A]], -1 ; CHECK-NEXT: [[S0:%.*]] = select i1 [[C0]], i32 [[NOT_A]], i32 -1 ; CHECK-NEXT: ret i32 [[S0]] ; - %not_a = xor i32 %a, -1 %c0 = icmp slt i32 %a, 0 %s0 = select i1 %c0, i32 %not_a, i32 -1 @@ -1299,11 +1298,22 @@ define <2 x i32> @select_icmp_slt0_xor_vec(<2 x i32> %x) { ret <2 x i32> %x.xor } -; Make sure that undef elements of the select condition are translated into undef elements of the shuffle mask. - define <4 x i32> @canonicalize_to_shuffle(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @canonicalize_to_shuffle( -; CHECK-NEXT: [[SEL:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 6, i32 undef> +; CHECK-NEXT: [[SEL:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3> +; CHECK-NEXT: ret <4 x i32> [[SEL]] +; + %sel = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %sel +} + +; Undef elements of the select condition may not be translated into undef elements of a shuffle mask +; because undef in a shuffle mask means we can return anything, not just one of the selected values. 
+; https://bugs.llvm.org/show_bug.cgi?id=32486 + +define <4 x i32> @undef_elts_in_condition(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @undef_elts_in_condition( +; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> <i1 true, i1 undef, i1 false, i1 undef>, <4 x i32> %a, <4 x i32> %b ; CHECK-NEXT: ret <4 x i32> [[SEL]] ; %sel = select <4 x i1> <i1 true, i1 undef, i1 false, i1 undef>, <4 x i32> %a, <4 x i32> %b @@ -1332,3 +1342,29 @@ define <4 x i32> @cannot_canonicalize_to_shuffle2(<4 x i32> %a, <4 x i32> %b) { ret <4 x i32> %sel } +declare void @llvm.assume(i1) + +define i8 @assume_cond_true(i1 %cond, i8 %x, i8 %y) { +; CHECK-LABEL: @assume_cond_true( +; CHECK-NEXT: call void @llvm.assume(i1 %cond) +; CHECK-NEXT: ret i8 %x +; + call void @llvm.assume(i1 %cond) + %sel = select i1 %cond, i8 %x, i8 %y + ret i8 %sel +} + +; computeKnownBitsFromAssume() understands the 'not' of an assumed condition. + +define i8 @assume_cond_false(i1 %cond, i8 %x, i8 %y) { +; CHECK-LABEL: @assume_cond_false( +; CHECK-NEXT: [[NOTCOND:%.*]] = xor i1 %cond, true +; CHECK-NEXT: call void @llvm.assume(i1 [[NOTCOND]]) +; CHECK-NEXT: ret i8 %y +; + %notcond = xor i1 %cond, true + call void @llvm.assume(i1 %notcond) + %sel = select i1 %cond, i8 %x, i8 %y + ret i8 %sel +} + diff --git a/test/Transforms/InstCombine/select_meta.ll b/test/Transforms/InstCombine/select_meta.ll index 82a85e5836dc..7d5771a0a81c 100644 --- a/test/Transforms/InstCombine/select_meta.ll +++ b/test/Transforms/InstCombine/select_meta.ll @@ -193,12 +193,11 @@ define i32 @test74(i32 %x) { ret i32 %retval } -; FIXME: ; The compare should change, but the metadata remains the same because the select operands are not swapped. define i32 @smin1(i32 %x) { ; CHECK-LABEL: @smin1( ; CHECK-NEXT: [[NOT_X:%.*]] = xor i32 %x, -1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, 0 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[NOT_X]], -1 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[NOT_X]], i32 -1, !prof ![[MD1]] ; CHECK-NEXT: ret i32 [[SEL]] ; @@ -208,13 +207,12 @@ define i32 @smin1(i32 %x) { ret i32 %sel } -; FIXME: ; The compare should change, and the metadata is swapped because the select operands are swapped. define i32 @smin2(i32 %x) { ; CHECK-LABEL: @smin2( ; CHECK-NEXT: [[NOT_X:%.*]] = xor i32 %x, -1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 %x, 0 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 -1, i32 [[NOT_X]], !prof ![[MD1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[NOT_X]], -1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[NOT_X]], i32 -1, !prof ![[MD3]] ; CHECK-NEXT: ret i32 [[SEL]] ; %not_x = xor i32 %x, -1 @@ -223,12 +221,11 @@ define i32 @smin2(i32 %x) { ret i32 %sel } -; FIXME: ; The compare should change, but the metadata remains the same because the select operands are not swapped. define i32 @smax1(i32 %x) { ; CHECK-LABEL: @smax1( ; CHECK-NEXT: [[NOT_X:%.*]] = xor i32 %x, -1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 %x, 0 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[NOT_X]], -1 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[NOT_X]], i32 -1, !prof ![[MD1]] ; CHECK-NEXT: ret i32 [[SEL]] ; @@ -238,13 +235,12 @@ define i32 @smax1(i32 %x) { ret i32 %sel } -; FIXME: ; The compare should change, and the metadata is swapped because the select operands are swapped. 
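; (A hedged sketch of the metadata swap, with illustrative weights: a select
; annotated as
;
;   %sel = select i1 %cmp, i32 %a, i32 %b, !prof !{!"branch_weights", i32 2, i32 10}
;
; that gets canonicalized with an inverted condition and swapped arms must
; become
;
;   %sel = select i1 %inv, i32 %b, i32 %a, !prof !{!"branch_weights", i32 10, i32 2}
;
; so each arm keeps the weight of the path that actually selects it; hence
; the MD1 vs. MD3 references in these checks.)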
define i32 @smax2(i32 %x) { ; CHECK-LABEL: @smax2( ; CHECK-NEXT: [[NOT_X:%.*]] = xor i32 %x, -1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, 0 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 -1, i32 [[NOT_X]], !prof ![[MD1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[NOT_X]], -1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[NOT_X]], i32 -1, !prof ![[MD3]] ; CHECK-NEXT: ret i32 [[SEL]] ; %not_x = xor i32 %x, -1 @@ -253,11 +249,10 @@ define i32 @smax2(i32 %x) { ret i32 %sel } -; FIXME: ; The compare should change, but the metadata remains the same because the select operands are not swapped. define i32 @umin1(i32 %x) { ; CHECK-LABEL: @umin1( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, -1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 %x, -2147483648 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 %x, i32 -2147483648, !prof ![[MD1]] ; CHECK-NEXT: ret i32 [[SEL]] ; @@ -266,12 +261,11 @@ define i32 @umin1(i32 %x) { ret i32 %sel } -; FIXME: ; The compare should change, and the metadata is swapped because the select operands are swapped. define i32 @umin2(i32 %x) { ; CHECK-LABEL: @umin2( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 %x, 0 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 2147483647, i32 %x, !prof ![[MD1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 %x, 2147483647 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 %x, i32 2147483647, !prof ![[MD3]] ; CHECK-NEXT: ret i32 [[SEL]] ; %cmp = icmp slt i32 %x, 0 @@ -279,11 +273,10 @@ define i32 @umin2(i32 %x) { ret i32 %sel } -; FIXME: ; The compare should change, but the metadata remains the same because the select operands are not swapped. define i32 @umax1(i32 %x) { ; CHECK-LABEL: @umax1( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 %x, 0 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 %x, 2147483647 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 %x, i32 2147483647, !prof ![[MD1]] ; CHECK-NEXT: ret i32 [[SEL]] ; @@ -292,12 +285,11 @@ define i32 @umax1(i32 %x) { ret i32 %sel } -; FIXME: ; The compare should change, and the metadata is swapped because the select operands are swapped. define i32 @umax2(i32 %x) { ; CHECK-LABEL: @umax2( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, -1 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 -2147483648, i32 %x, !prof ![[MD1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 %x, -2147483648 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 %x, i32 -2147483648, !prof ![[MD3]] ; CHECK-NEXT: ret i32 [[SEL]] ; %cmp = icmp sgt i32 %x, -1 diff --git a/test/Transforms/InstCombine/shift-sra.ll b/test/Transforms/InstCombine/shift-sra.ll index 75235500d513..4483e60b506a 100644 --- a/test/Transforms/InstCombine/shift-sra.ll +++ b/test/Transforms/InstCombine/shift-sra.ll @@ -1,26 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s define i32 @test1(i32 %X, i8 %A) { - %shift.upgrd.1 = zext i8 %A to i32 ; <i32> [#uses=1] - ; can be logical shift. - %Y = ashr i32 %X, %shift.upgrd.1 ; <i32> [#uses=1] - %Z = and i32 %Y, 1 ; <i32> [#uses=1] - ret i32 %Z ; CHECK-LABEL: @test1( -; CHECK: lshr i32 %X, %shift.upgrd.1 +; CHECK-NEXT: [[SHIFT_UPGRD_1:%.*]] = zext i8 %A to i32 +; CHECK-NEXT: [[Y1:%.*]] = lshr i32 %X, [[SHIFT_UPGRD_1]] +; CHECK-NEXT: [[Z:%.*]] = and i32 [[Y1]], 1 +; CHECK-NEXT: ret i32 [[Z]] +; + %shift.upgrd.1 = zext i8 %A to i32 + ; can be logical shift. 
+ %Y = ashr i32 %X, %shift.upgrd.1 + %Z = and i32 %Y, 1 + ret i32 %Z } define i32 @test2(i8 %tmp) { - %tmp3 = zext i8 %tmp to i32 ; <i32> [#uses=1] - %tmp4 = add i32 %tmp3, 7 ; <i32> [#uses=1] - %tmp5 = ashr i32 %tmp4, 3 ; <i32> [#uses=1] - ret i32 %tmp5 ; CHECK-LABEL: @test2( -; CHECK: lshr i32 %tmp4, 3 +; CHECK-NEXT: [[TMP3:%.*]] = zext i8 %tmp to i32 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 7 +; CHECK-NEXT: [[TMP51:%.*]] = lshr i32 [[TMP4]], 3 +; CHECK-NEXT: ret i32 [[TMP51]] +; + %tmp3 = zext i8 %tmp to i32 + %tmp4 = add i32 %tmp3, 7 + %tmp5 = ashr i32 %tmp4, 3 + ret i32 %tmp5 } define i64 @test3(i1 %X, i64 %Y, i1 %Cond) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: br i1 %Cond, label %T, label %F +; CHECK: T: +; CHECK-NEXT: [[X2:%.*]] = sext i1 %X to i64 +; CHECK-NEXT: br label %C +; CHECK: F: +; CHECK-NEXT: [[Y2:%.*]] = ashr i64 %Y, 63 +; CHECK-NEXT: br label %C +; CHECK: C: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ [[X2]], %T ], [ [[Y2]], %F ] +; CHECK-NEXT: ret i64 [[P]] +; br i1 %Cond, label %T, label %F T: %X2 = sext i1 %X to i64 @@ -29,16 +50,24 @@ F: %Y2 = ashr i64 %Y, 63 br label %C C: - %P = phi i64 [%X2, %T], [%Y2, %F] + %P = phi i64 [%X2, %T], [%Y2, %F] %S = ashr i64 %P, 12 ret i64 %S - -; CHECK-LABEL: @test3( -; CHECK: %P = phi i64 -; CHECK-NEXT: ret i64 %P } define i64 @test4(i1 %X, i64 %Y, i1 %Cond) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: br i1 %Cond, label %T, label %F +; CHECK: T: +; CHECK-NEXT: [[X2:%.*]] = sext i1 %X to i64 +; CHECK-NEXT: br label %C +; CHECK: F: +; CHECK-NEXT: [[Y2:%.*]] = ashr i64 %Y, 63 +; CHECK-NEXT: br label %C +; CHECK: C: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ [[X2]], %T ], [ [[Y2]], %F ] +; CHECK-NEXT: ret i64 [[P]] +; br i1 %Cond, label %T, label %F T: %X2 = sext i1 %X to i64 @@ -47,18 +76,29 @@ F: %Y2 = ashr i64 %Y, 63 br label %C C: - %P = phi i64 [%X2, %T], [%Y2, %F] + %P = phi i64 [%X2, %T], [%Y2, %F] %R = shl i64 %P, 12 %S = ashr i64 %R, 12 ret i64 %S - -; CHECK-LABEL: @test4( -; CHECK: %P = phi i64 -; CHECK-NEXT: ret i64 %P } ; rdar://7732987 define i32 @test5(i32 %Y) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: br i1 undef, label %A, label %C +; CHECK: A: +; CHECK-NEXT: br i1 undef, label %B, label %D +; CHECK: B: +; CHECK-NEXT: br label %D +; CHECK: C: +; CHECK-NEXT: br i1 undef, label %D, label %E +; CHECK: D: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, %A ], [ 0, %B ], [ %Y, %C ] +; CHECK-NEXT: [[S:%.*]] = ashr i32 [[P]], 16 +; CHECK-NEXT: ret i32 [[S]] +; CHECK: E: +; CHECK-NEXT: ret i32 0 +; br i1 undef, label %A, label %C A: br i1 undef, label %B, label %D @@ -67,12 +107,59 @@ B: C: br i1 undef, label %D, label %E D: - %P = phi i32 [0, %A], [0, %B], [%Y, %C] + %P = phi i32 [0, %A], [0, %B], [%Y, %C] %S = ashr i32 %P, 16 ret i32 %S -; CHECK-LABEL: @test5( -; CHECK: %P = phi i32 -; CHECK-NEXT: ashr i32 %P, 16 E: ret i32 0 } + +; (X >>s C1) >>s C2 --> X >>s (C1 + C2) + +define i32 @ashr_ashr(i32 %x) { +; CHECK-LABEL: @ashr_ashr( +; CHECK-NEXT: [[SH2:%.*]] = ashr i32 %x, 12 +; CHECK-NEXT: ret i32 [[SH2]] +; + %sh1 = ashr i32 %x, 5 + %sh2 = ashr i32 %sh1, 7 + ret i32 %sh2 +} + +; PR3851 +; (X >>s C1) >>s C2 --> X >>s (Bitwidth - 1) + +define i32 @ashr_overshift(i32 %x) { +; CHECK-LABEL: @ashr_overshift( +; CHECK-NEXT: [[SH2:%.*]] = ashr i32 %x, 31 +; CHECK-NEXT: ret i32 [[SH2]] +; + %sh1 = ashr i32 %x, 15 + %sh2 = ashr i32 %sh1, 17 + ret i32 %sh2 +} + +; (X >>s C1) >>s C2 --> X >>s (C1 + C2) + +define <2 x i32> @ashr_ashr_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @ashr_ashr_splat_vec( +; CHECK-NEXT: [[SH2:%.*]] = ashr <2 x i32> %x, <i32 
12, i32 12> +; CHECK-NEXT: ret <2 x i32> [[SH2]] +; + %sh1 = ashr <2 x i32> %x, <i32 5, i32 5> + %sh2 = ashr <2 x i32> %sh1, <i32 7, i32 7> + ret <2 x i32> %sh2 +} + +; (X >>s C1) >>s C2 --> X >>s (Bitwidth - 1) + +define <2 x i32> @ashr_overshift_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @ashr_overshift_splat_vec( +; CHECK-NEXT: [[SH2:%.*]] = ashr <2 x i32> %x, <i32 31, i32 31> +; CHECK-NEXT: ret <2 x i32> [[SH2]] +; + %sh1 = ashr <2 x i32> %x, <i32 15, i32 15> + %sh2 = ashr <2 x i32> %sh1, <i32 17, i32 17> + ret <2 x i32> %sh2 +} + diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll index c046a72110c2..60ba35557f70 100644 --- a/test/Transforms/InstCombine/shift.ll +++ b/test/Transforms/InstCombine/shift.ll @@ -1,6 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; This test makes sure that these instructions are properly eliminated. -; ; RUN: opt < %s -instcombine -S | FileCheck %s define i32 @test1(i32 %A) { @@ -161,9 +159,8 @@ define i8 @test9(i8 %A) { ret i8 %C } -;; This transformation is deferred to DAGCombine: ;; (A >> 7) << 7 === A & 128 -;; The shl may be valuable to scalar evolution. + define i8 @test10(i8 %A) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[B:%.*]] = and i8 %A, -128 @@ -454,9 +451,8 @@ define i32 @test25(i32 %tmp.2, i32 %AA) { define <2 x i32> @test25_vector(<2 x i32> %tmp.2, <2 x i32> %AA) { ; CHECK-LABEL: @test25_vector( -; CHECK-NEXT: [[TMP_3:%.*]] = lshr <2 x i32> %tmp.2, <i32 17, i32 17> -; CHECK-NEXT: [[TMP_51:%.*]] = shl <2 x i32> [[TMP_3]], <i32 17, i32 17> -; CHECK-NEXT: [[X2:%.*]] = add <2 x i32> [[TMP_51]], %AA +; CHECK-NEXT: [[TMP_3:%.*]] = and <2 x i32> %tmp.2, <i32 -131072, i32 -131072> +; CHECK-NEXT: [[X2:%.*]] = add <2 x i32> [[TMP_3]], %AA ; CHECK-NEXT: [[TMP_6:%.*]] = and <2 x i32> [[X2]], <i32 -131072, i32 -131072> ; CHECK-NEXT: ret <2 x i32> [[TMP_6]] ; @@ -640,30 +636,25 @@ define <2 x i1> @test35vec(<2 x i32> %X) { define i128 @test36(i128 %A, i128 %B) { ; CHECK-LABEL: @test36( -; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP231:%.*]] = or i128 %B, %A ; CHECK-NEXT: [[INS:%.*]] = and i128 [[TMP231]], 18446744073709551615 ; CHECK-NEXT: ret i128 [[INS]] ; -entry: %tmp27 = shl i128 %A, 64 %tmp23 = shl i128 %B, 64 %ins = or i128 %tmp23, %tmp27 %tmp45 = lshr i128 %ins, 64 ret i128 %tmp45 - } define i64 @test37(i128 %A, i32 %B) { ; CHECK-LABEL: @test37( -; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP22:%.*]] = zext i32 %B to i128 ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i128 [[TMP22]], 32 ; CHECK-NEXT: [[INS:%.*]] = or i128 [[TMP23]], %A ; CHECK-NEXT: [[TMP46:%.*]] = trunc i128 [[INS]] to i64 ; CHECK-NEXT: ret i64 [[TMP46]] ; -entry: %tmp27 = shl i128 %A, 64 %tmp22 = zext i32 %B to i128 %tmp23 = shl i128 %tmp22, 96 @@ -671,7 +662,17 @@ entry: %tmp45 = lshr i128 %ins, 64 %tmp46 = trunc i128 %tmp45 to i64 ret i64 %tmp46 +} +define <2 x i32> @shl_nuw_nsw_splat_vec(<2 x i8> %x) { +; CHECK-LABEL: @shl_nuw_nsw_splat_vec( +; CHECK-NEXT: [[T2:%.*]] = zext <2 x i8> %x to <2 x i32> +; CHECK-NEXT: [[T3:%.*]] = shl nuw nsw <2 x i32> [[T2]], <i32 17, i32 17> +; CHECK-NEXT: ret <2 x i32> [[T3]] +; + %t2 = zext <2 x i8> %x to <2 x i32> + %t3 = shl <2 x i32> %t2, <i32 17, i32 17> + ret <2 x i32> %t3 } define i32 @test38(i32 %x) nounwind readnone { @@ -789,6 +790,8 @@ define i32 @test45(i32 %a) nounwind { ret i32 %z } +; (X >>?exact C1) << C2 --> X >>?exact (C1-C2) + define i32 @test46(i32 %a) { ; CHECK-LABEL: @test46( ; CHECK-NEXT: [[Z:%.*]] = ashr exact i32 %a, 2 @@ -799,16 +802,44 @@ define i32 @test46(i32 
%a) { ret i32 %z } -define i32 @test47(i32 %a) { +; (X >>?exact C1) << C2 --> X >>?exact (C1-C2) + +define <2 x i32> @test46_splat_vec(<2 x i32> %a) { +; CHECK-LABEL: @test46_splat_vec( +; CHECK-NEXT: [[Z:%.*]] = ashr exact <2 x i32> %a, <i32 2, i32 2> +; CHECK-NEXT: ret <2 x i32> [[Z]] +; + %y = ashr exact <2 x i32> %a, <i32 3, i32 3> + %z = shl <2 x i32> %y, <i32 1, i32 1> + ret <2 x i32> %z +} + +; (X >>?exact C1) << C2 --> X >>?exact (C1-C2) + +define i8 @test47(i8 %a) { ; CHECK-LABEL: @test47( -; CHECK-NEXT: [[Z:%.*]] = lshr exact i32 %a, 2 -; CHECK-NEXT: ret i32 [[Z]] +; CHECK-NEXT: [[Z:%.*]] = lshr exact i8 %a, 2 +; CHECK-NEXT: ret i8 [[Z]] ; - %y = lshr exact i32 %a, 3 - %z = shl i32 %y, 1 - ret i32 %z + %y = lshr exact i8 %a, 3 + %z = shl i8 %y, 1 + ret i8 %z +} + +; (X >>?exact C1) << C2 --> X >>?exact (C1-C2) + +define <2 x i8> @test47_splat_vec(<2 x i8> %a) { +; CHECK-LABEL: @test47_splat_vec( +; CHECK-NEXT: [[Z:%.*]] = lshr exact <2 x i8> %a, <i8 2, i8 2> +; CHECK-NEXT: ret <2 x i8> [[Z]] +; + %y = lshr exact <2 x i8> %a, <i8 3, i8 3> + %z = shl <2 x i8> %y, <i8 1, i8 1> + ret <2 x i8> %z } +; (X >>u,exact C1) << C2 --> X << (C2-C1) when C2 > C1 + define i32 @test48(i32 %x) { ; CHECK-LABEL: @test48( ; CHECK-NEXT: [[B:%.*]] = shl i32 %x, 2 @@ -819,6 +850,32 @@ define i32 @test48(i32 %x) { ret i32 %B } +; Verify that wrap flags are preserved from the original 'shl'. + +define i32 @test48_nuw_nsw(i32 %x) { +; CHECK-LABEL: @test48_nuw_nsw( +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i32 %x, 2 +; CHECK-NEXT: ret i32 [[B]] +; + %A = lshr exact i32 %x, 1 + %B = shl nuw nsw i32 %A, 3 + ret i32 %B +} + +; (X >>u,exact C1) << C2 --> X << (C2-C1) when splatted C2 > C1 + +define <2 x i32> @test48_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @test48_splat_vec( +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw <2 x i32> %x, <i32 2, i32 2> +; CHECK-NEXT: ret <2 x i32> [[B]] +; + %A = lshr exact <2 x i32> %x, <i32 1, i32 1> + %B = shl nsw nuw <2 x i32> %A, <i32 3, i32 3> + ret <2 x i32> %B +} + +; (X >>s,exact C1) << C2 --> X << (C2-C1) when C2 > C1 + define i32 @test49(i32 %x) { ; CHECK-LABEL: @test49( ; CHECK-NEXT: [[B:%.*]] = shl i32 %x, 2 @@ -829,6 +886,32 @@ define i32 @test49(i32 %x) { ret i32 %B } +; Verify that wrap flags are preserved from the original 'shl'. + +define i32 @test49_nuw_nsw(i32 %x) { +; CHECK-LABEL: @test49_nuw_nsw( +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i32 %x, 2 +; CHECK-NEXT: ret i32 [[B]] +; + %A = ashr exact i32 %x, 1 + %B = shl nuw nsw i32 %A, 3 + ret i32 %B +} + +; (X >>s,exact C1) << C2 --> X << (C2-C1) when splatted C2 > C1 + +define <2 x i32> @test49_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @test49_splat_vec( +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw <2 x i32> %x, <i32 2, i32 2> +; CHECK-NEXT: ret <2 x i32> [[B]] +; + %A = ashr exact <2 x i32> %x, <i32 1, i32 1> + %B = shl nsw nuw <2 x i32> %A, <i32 3, i32 3> + ret <2 x i32> %B +} + +; (X <<nsw C1) >>s C2 --> X >>s (C2-C1) + define i32 @test50(i32 %x) { ; CHECK-LABEL: @test50( ; CHECK-NEXT: [[B:%.*]] = ashr i32 %x, 2 @@ -839,6 +922,21 @@ define i32 @test50(i32 %x) { ret i32 %B } +; (X <<nsw C1) >>s C2 --> X >>s (C2-C1) +; Also, check that exact is propagated. 
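; (A worked instance with illustrative values, C1 = 1, C2 = 3, x = -20:
;   (-20 << 1) >>s 3  =  -40 >>s 3  =  -5
;    -20 >>s (3 - 1)  =  -20 >>s 2  =  -5
; The nsw on the shl guarantees no sign information was shifted out, which is
; what makes the two forms agree.)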
+ +define <2 x i32> @test50_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @test50_splat_vec( +; CHECK-NEXT: [[B:%.*]] = ashr exact <2 x i32> %x, <i32 2, i32 2> +; CHECK-NEXT: ret <2 x i32> [[B]] +; + %A = shl nsw <2 x i32> %x, <i32 1, i32 1> + %B = ashr exact <2 x i32> %A, <i32 3, i32 3> + ret <2 x i32> %B +} + +; (X <<nuw C1) >>u C2 --> X >>u (C2-C1) + define i32 @test51(i32 %x) { ; CHECK-LABEL: @test51( ; CHECK-NEXT: [[B:%.*]] = lshr i32 %x, 2 @@ -849,6 +947,48 @@ define i32 @test51(i32 %x) { ret i32 %B } +; (X <<nuw C1) >>u C2 --> X >>u (C2-C1) with splats +; Also, check that exact is propagated. + +define <2 x i32> @test51_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @test51_splat_vec( +; CHECK-NEXT: [[B:%.*]] = lshr exact <2 x i32> %x, <i32 2, i32 2> +; CHECK-NEXT: ret <2 x i32> [[B]] +; + %A = shl nuw <2 x i32> %x, <i32 1, i32 1> + %B = lshr exact <2 x i32> %A, <i32 3, i32 3> + ret <2 x i32> %B +} + +; (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2) +; Also, check that exact is propagated. + +define i32 @test51_no_nuw(i32 %x) { +; CHECK-LABEL: @test51_no_nuw( +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 %x, 2 +; CHECK-NEXT: [[B:%.*]] = and i32 [[TMP1]], 536870911 +; CHECK-NEXT: ret i32 [[B]] +; + %A = shl i32 %x, 1 + %B = lshr exact i32 %A, 3 + ret i32 %B +} + +; (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2) + +define <2 x i32> @test51_no_nuw_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @test51_no_nuw_splat_vec( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> %x, <i32 2, i32 2> +; CHECK-NEXT: [[B:%.*]] = and <2 x i32> [[TMP1]], <i32 536870911, i32 536870911> +; CHECK-NEXT: ret <2 x i32> [[B]] +; + %A = shl <2 x i32> %x, <i32 1, i32 1> + %B = lshr <2 x i32> %A, <i32 3, i32 3> + ret <2 x i32> %B +} + +; (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2) + define i32 @test52(i32 %x) { ; CHECK-LABEL: @test52( ; CHECK-NEXT: [[B:%.*]] = shl nsw i32 %x, 2 @@ -859,6 +999,20 @@ define i32 @test52(i32 %x) { ret i32 %B } +; (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2) + +define <2 x i32> @test52_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @test52_splat_vec( +; CHECK-NEXT: [[B:%.*]] = shl nsw <2 x i32> %x, <i32 2, i32 2> +; CHECK-NEXT: ret <2 x i32> [[B]] +; + %A = shl nsw <2 x i32> %x, <i32 3, i32 3> + %B = ashr <2 x i32> %A, <i32 1, i32 1> + ret <2 x i32> %B +} + +; (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2) + define i32 @test53(i32 %x) { ; CHECK-LABEL: @test53( ; CHECK-NEXT: [[B:%.*]] = shl nuw i32 %x, 2 @@ -869,6 +1023,45 @@ define i32 @test53(i32 %x) { ret i32 %B } +; (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2) + +define <2 x i32> @test53_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @test53_splat_vec( +; CHECK-NEXT: [[B:%.*]] = shl nuw <2 x i32> %x, <i32 2, i32 2> +; CHECK-NEXT: ret <2 x i32> [[B]] +; + %A = shl nuw <2 x i32> %x, <i32 3, i32 3> + %B = lshr <2 x i32> %A, <i32 1, i32 1> + ret <2 x i32> %B +} + +; (X << C1) >>u C2 --> X << (C1 - C2) & (-1 >> C2) + +define i8 @test53_no_nuw(i8 %x) { +; CHECK-LABEL: @test53_no_nuw( +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 %x, 2 +; CHECK-NEXT: [[B:%.*]] = and i8 [[TMP1]], 124 +; CHECK-NEXT: ret i8 [[B]] +; + %A = shl i8 %x, 3 + %B = lshr i8 %A, 1 + ret i8 %B +} + +; (X << C1) >>u C2 --> X << (C1 - C2) & (-1 >> C2) +; FIXME: Demanded bits should change the mask constant as it does for the scalar case. 
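; (Spelling out the scalar @test53_no_nuw above with an illustrative value,
; i8 x = 0xff, C1 = 3, C2 = 1:
;   (0xff << 3) >>u 1   =  0xf8 >>u 1   =  0x7c  (124)
;   (0xff << 2) & 0x7c  =  0xfc & 0x7c  =  0x7c
; The generic mask is -1 >>u C2 = 0x7f, but demanded bits can also drop the
; low C1 - C2 bits that the shl zeroes, tightening it to 0x7c; the FIXME notes
; that the vector case below still uses the looser 0x7f.)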
+ +define <2 x i8> @test53_no_nuw_splat_vec(<2 x i8> %x) { +; CHECK-LABEL: @test53_no_nuw_splat_vec( +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i8> %x, <i8 2, i8 2> +; CHECK-NEXT: [[B:%.*]] = and <2 x i8> [[TMP1]], <i8 127, i8 127> +; CHECK-NEXT: ret <2 x i8> [[B]] +; + %A = shl <2 x i8> %x, <i8 3, i8 3> + %B = lshr <2 x i8> %A, <i8 1, i8 1> + ret <2 x i8> %B +} + define i32 @test54(i32 %x) { ; CHECK-LABEL: @test54( ; CHECK-NEXT: [[TMP1:%.*]] = shl i32 %x, 3 @@ -1041,7 +1234,7 @@ define <2 x i65> @test_63(<2 x i64> %t) { ; CHECK-LABEL: @test_63( ; CHECK-NEXT: [[A:%.*]] = zext <2 x i64> %t to <2 x i65> ; CHECK-NEXT: [[SEXT:%.*]] = shl <2 x i65> [[A]], <i65 33, i65 33> -; CHECK-NEXT: [[B:%.*]] = ashr <2 x i65> [[SEXT]], <i65 33, i65 33> +; CHECK-NEXT: [[B:%.*]] = ashr exact <2 x i65> [[SEXT]], <i65 33, i65 33> ; CHECK-NEXT: ret <2 x i65> [[B]] ; %a = zext <2 x i64> %t to <2 x i65> @@ -1052,12 +1245,26 @@ define <2 x i65> @test_63(<2 x i64> %t) { define i64 @test_64(i32 %t) { ; CHECK-LABEL: @test_64( -; CHECK-NEXT: [[SHL:%.*]] = shl i32 %t, 8 -; CHECK-NEXT: [[EXT:%.*]] = zext i32 [[SHL]] to i64 -; CHECK-NEXT: ret i64 [[EXT]] - +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 %t, 8 +; CHECK-NEXT: [[SHL:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: ret i64 [[SHL]] +; %and = and i32 %t, 16777215 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 8 ret i64 %shl } + +define <2 x i64> @test_64_splat_vec(<2 x i32> %t) { +; CHECK-LABEL: @test_64_splat_vec( +; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> %t, <i32 16777215, i32 16777215> +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw <2 x i32> [[AND]], <i32 8, i32 8> +; CHECK-NEXT: [[SHL:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[SHL]] +; + %and = and <2 x i32> %t, <i32 16777215, i32 16777215> + %ext = zext <2 x i32> %and to <2 x i64> + %shl = shl <2 x i64> %ext, <i64 8, i64 8> + ret <2 x i64> %shl +} + diff --git a/test/Transforms/InstCombine/shufflevec-bitcast.ll b/test/Transforms/InstCombine/shufflevec-bitcast.ll new file mode 100644 index 000000000000..0f0365a07fb4 --- /dev/null +++ b/test/Transforms/InstCombine/shufflevec-bitcast.ll @@ -0,0 +1,16 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +define void @test(<16 x i8> %w, i32* %o1, float* %o2) { + +; CHECK: %v.bc = bitcast <16 x i8> %w to <4 x i32> +; CHECK-NEXT: %v.extract = extractelement <4 x i32> %v.bc, i32 3 +; CHECK-NEXT: %v.bc{{[0-9]*}} = bitcast <16 x i8> %w to <4 x float> +; CHECK-NEXT: %v.extract{{[0-9]*}} = extractelement <4 x float> %v.bc{{[0-9]*}}, i32 3 + + %v = shufflevector <16 x i8> %w, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15> + %f = bitcast <4 x i8> %v to float + %i = bitcast <4 x i8> %v to i32 + store i32 %i, i32* %o1, align 4 + store float %f, float* %o2, align 4 + ret void +} diff --git a/test/Transforms/InstCombine/signext.ll b/test/Transforms/InstCombine/signext.ll index bccadeb396f2..ff92ec0a8e3c 100644 --- a/test/Transforms/InstCombine/signext.ll +++ b/test/Transforms/InstCombine/signext.ll @@ -61,6 +61,10 @@ define i32 @test5(i32 %x) { ret i32 %tmp.4 } +; If the shift amount equals the difference in width of the destination +; and source scalar types: +; ashr (shl (zext X), C), C --> sext X + define i32 @test6(i16 %P) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[TMP_5:%.*]] = sext i16 %P to i32 @@ -72,6 +76,19 @@ define i32 @test6(i16 %P) { ret i32 %tmp.5 } +; Vectors should get the same fold as above. 
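; (A worked lane with illustrative values: for i12 -> i32 the shift amount is
; 32 - 12 = 20. Taking x = 0x800, i.e. -2048 as an i12:
;   zext:    0x00000800
;   shl 20:  0x80000000
;   ashr 20: 0xfffff800  = -2048
; which is exactly 'sext i12 %x to i32', so the shl/ashr pair folds away.)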
+ +define <2 x i32> @test6_splat_vec(<2 x i12> %P) { +; CHECK-LABEL: @test6_splat_vec( +; CHECK-NEXT: [[ASHR:%.*]] = sext <2 x i12> %P to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[ASHR]] +; + %z = zext <2 x i12> %P to <2 x i32> + %shl = shl <2 x i32> %z, <i32 20, i32 20> + %ashr = ashr <2 x i32> %shl, <i32 20, i32 20> + ret <2 x i32> %ashr +} + define i32 @test7(i32 %x) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[SUB:%.*]] = ashr i32 %x, 5 diff --git a/test/Transforms/InstCombine/sitofp.ll b/test/Transforms/InstCombine/sitofp.ll index 820977838836..149154723b95 100644 --- a/test/Transforms/InstCombine/sitofp.ll +++ b/test/Transforms/InstCombine/sitofp.ll @@ -1,41 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -; CHECK-LABEL: test1 -; CHECK: ret i1 true define i1 @test1(i8 %A) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret i1 true +; %B = sitofp i8 %A to double %C = fcmp ult double %B, 128.0 ret i1 %C } -; CHECK-LABEL: test2 -; CHECK: ret i1 true define i1 @test2(i8 %A) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: ret i1 true +; %B = sitofp i8 %A to double %C = fcmp ugt double %B, -128.1 ret i1 %C } -; CHECK-LABEL: test3 -; CHECK: ret i1 true define i1 @test3(i8 %A) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: ret i1 true +; %B = sitofp i8 %A to double %C = fcmp ule double %B, 127.0 ret i1 %C } -; CHECK-LABEL: test4 -; CHECK: icmp ne i8 %A, 127 -; CHECK-NEXT: ret i1 define i1 @test4(i8 %A) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A:%.*]], 127 +; CHECK-NEXT: ret i1 [[C]] +; %B = sitofp i8 %A to double %C = fcmp ult double %B, 127.0 ret i1 %C } -; CHECK-LABEL: test5 -; CHECK: ret i32 define i32 @test5(i32 %A) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: ret i32 [[A:%.*]] +; %B = sitofp i32 %A to double %C = fptosi double %B to i32 %D = uitofp i32 %C to double @@ -43,10 +49,11 @@ define i32 @test5(i32 %A) { ret i32 %E } -; CHECK-LABEL: test6 -; CHECK: and i32 %A, 39 -; CHECK-NEXT: ret i32 define i32 @test6(i32 %A) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: [[ADDCONV:%.*]] = and i32 [[A:%.*]], 39 +; CHECK-NEXT: ret i32 [[ADDCONV]] +; %B = and i32 %A, 7 %C = and i32 %A, 32 %D = sitofp i32 %B to double @@ -56,35 +63,39 @@ define i32 @test6(i32 %A) { ret i32 %G } -; CHECK-LABEL: test7 -; CHECK: ret i32 -define i32 @test7(i32 %A) nounwind { +define i32 @test7(i32 %A) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: ret i32 [[A:%.*]] +; %B = sitofp i32 %A to double %C = fptoui double %B to i32 ret i32 %C } -; CHECK-LABEL: test8 -; CHECK: ret i32 -define i32 @test8(i32 %A) nounwind { +define i32 @test8(i32 %A) { +; CHECK-LABEL: @test8( +; CHECK-NEXT: ret i32 [[A:%.*]] +; %B = uitofp i32 %A to double %C = fptosi double %B to i32 ret i32 %C } -; CHECK-LABEL: test9 -; CHECK: zext i8 -; CHECK-NEXT: ret i32 -define i32 @test9(i8 %A) nounwind { +define i32 @test9(i8 %A) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: [[C:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: ret i32 [[C]] +; %B = sitofp i8 %A to float %C = fptoui float %B to i32 ret i32 %C } -; CHECK-LABEL: test10 -; CHECK: sext i8 -; CHECK-NEXT: ret i32 -define i32 @test10(i8 %A) nounwind { +define i32 @test10(i8 %A) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: [[C:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: ret i32 [[C]] +; %B = sitofp i8 %A to float %C = fptosi float %B to i32 ret i32 %C @@ -92,10 +103,12 @@ define i32 @test10(i8 %A) nounwind { ; If the input value is outside of the range of the output cast, it's ; undefined behavior, so we can assume it fits. 
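; (Illustrative value: for %A = 200, 'sitofp i32 200 to float' gives 200.0,
; and 'fptosi float 200.0 to i8' would be undefined, since 200 is outside
; [-128, 127]. Only in-range inputs are defined, and every such value is
; exact in a float's 24-bit significand, so the round-trip reduces to a
; trunc.)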
-; CHECK-LABEL: test11 -; CHECK: trunc -; CHECK-NEXT: ret i8 -define i8 @test11(i32 %A) nounwind { + +define i8 @test11(i32 %A) { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[C:%.*]] = trunc i32 [[A:%.*]] to i8 +; CHECK-NEXT: ret i8 [[C]] +; %B = sitofp i32 %A to float %C = fptosi float %B to i8 ret i8 %C @@ -103,82 +116,103 @@ define i8 @test11(i32 %A) nounwind { ; If the input value is negative, it'll be outside the range of the ; output cast, and thus undefined behavior. -; CHECK-LABEL: test12 -; CHECK: zext i8 -; CHECK-NEXT: ret i32 -define i32 @test12(i8 %A) nounwind { + +define i32 @test12(i8 %A) { +; CHECK-LABEL: @test12( +; CHECK-NEXT: [[C:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: ret i32 [[C]] +; %B = sitofp i8 %A to float %C = fptoui float %B to i32 ret i32 %C } ; This can't fold because the 25-bit input doesn't fit in the mantissa. -; CHECK-LABEL: test13 -; CHECK: uitofp -; CHECK-NEXT: fptoui -define i32 @test13(i25 %A) nounwind { + +define i32 @test13(i25 %A) { +; CHECK-LABEL: @test13( +; CHECK-NEXT: [[B:%.*]] = uitofp i25 [[A:%.*]] to float +; CHECK-NEXT: [[C:%.*]] = fptoui float [[B]] to i32 +; CHECK-NEXT: ret i32 [[C]] +; %B = uitofp i25 %A to float %C = fptoui float %B to i32 ret i32 %C } ; But this one can. -; CHECK-LABEL: test14 -; CHECK: zext i24 -; CHECK-NEXT: ret i32 -define i32 @test14(i24 %A) nounwind { + +define i32 @test14(i24 %A) { +; CHECK-LABEL: @test14( +; CHECK-NEXT: [[C:%.*]] = zext i24 [[A:%.*]] to i32 +; CHECK-NEXT: ret i32 [[C]] +; %B = uitofp i24 %A to float %C = fptoui float %B to i32 ret i32 %C } ; And this one can too. -; CHECK-LABEL: test15 -; CHECK: trunc i32 -; CHECK-NEXT: ret i24 -define i24 @test15(i32 %A) nounwind { + +define i24 @test15(i32 %A) { +; CHECK-LABEL: @test15( +; CHECK-NEXT: [[C:%.*]] = trunc i32 [[A:%.*]] to i24 +; CHECK-NEXT: ret i24 [[C]] +; %B = uitofp i32 %A to float %C = fptoui float %B to i24 ret i24 %C } -; This can fold because the 25-bit input is signed and we disard the sign bit. -; CHECK-LABEL: test16 -; CHECK: zext -define i32 @test16(i25 %A) nounwind { - %B = sitofp i25 %A to float - %C = fptoui float %B to i32 - ret i32 %C +; This can fold because the 25-bit input is signed and we discard the sign bit. + +define i32 @test16(i25 %A) { +; CHECK-LABEL: @test16( +; CHECK-NEXT: [[C:%.*]] = zext i25 [[A:%.*]] to i32 +; CHECK-NEXT: ret i32 [[C]] +; + %B = sitofp i25 %A to float + %C = fptoui float %B to i32 + ret i32 %C } ; This can't fold because the 26-bit input won't fit the mantissa -; even after disarding the signed bit. -; CHECK-LABEL: test17 -; CHECK: sitofp -; CHECK-NEXT: fptoui -define i32 @test17(i26 %A) nounwind { - %B = sitofp i26 %A to float - %C = fptoui float %B to i32 - ret i32 %C +; even after discarding the signed bit. + +define i32 @test17(i26 %A) { +; CHECK-LABEL: @test17( +; CHECK-NEXT: [[B:%.*]] = sitofp i26 [[A:%.*]] to float +; CHECK-NEXT: [[C:%.*]] = fptoui float [[B]] to i32 +; CHECK-NEXT: ret i32 [[C]] +; + %B = sitofp i26 %A to float + %C = fptoui float %B to i32 + ret i32 %C } -; This can fold because the 54-bit output is signed and we disard the sign bit. -; CHECK-LABEL: test18 -; CHECK: trunc -define i54 @test18(i64 %A) nounwind { - %B = sitofp i64 %A to double - %C = fptosi double %B to i54 - ret i54 %C +; This can fold because the 54-bit output is signed and we discard the sign bit. 
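; (The bit budget behind the next two tests: a double's significand holds 53
; bits, 52 stored plus 1 implicit. A signed i54 carries 53 magnitude bits
; plus the sign, so 53 <= 53 and the round-trip in test18 just below is
; exact; an i55 would need 54 > 53 magnitude bits, which is why test19 after
; it cannot fold.)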
+ +define i54 @test18(i64 %A) { +; CHECK-LABEL: @test18( +; CHECK-NEXT: [[C:%.*]] = trunc i64 [[A:%.*]] to i54 +; CHECK-NEXT: ret i54 [[C]] +; + %B = sitofp i64 %A to double + %C = fptosi double %B to i54 + ret i54 %C } ; This can't fold because the 55-bit output won't fit the mantissa -; even after disarding the sign bit. -; CHECK-LABEL: test19 -; CHECK: sitofp -; CHECK-NEXT: fptosi -define i55 @test19(i64 %A) nounwind { - %B = sitofp i64 %A to double - %C = fptosi double %B to i55 - ret i55 %C +; even after discarding the sign bit. + +define i55 @test19(i64 %A) { +; CHECK-LABEL: @test19( +; CHECK-NEXT: [[B:%.*]] = sitofp i64 [[A:%.*]] to double +; CHECK-NEXT: [[C:%.*]] = fptosi double [[B]] to i55 +; CHECK-NEXT: ret i55 [[C]] +; + %B = sitofp i64 %A to double + %C = fptosi double %B to i55 + ret i55 %C } diff --git a/test/Transforms/InstCombine/srem.ll b/test/Transforms/InstCombine/srem.ll deleted file mode 100644 index beefe4fb8d3f..000000000000 --- a/test/Transforms/InstCombine/srem.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: opt < %s -instcombine -S | grep srem - -define i64 @foo(i64 %x1, i64 %y2) { - %r = sdiv i64 %x1, %y2 - %r7 = mul i64 %r, %y2 - %r8 = sub i64 %x1, %r7 - ret i64 %r8 -} diff --git a/test/Transforms/InstCombine/stpcpy_chk-1.ll b/test/Transforms/InstCombine/stpcpy_chk-1.ll index 2fcc34b05227..45e6879c8d26 100644 --- a/test/Transforms/InstCombine/stpcpy_chk-1.ll +++ b/test/Transforms/InstCombine/stpcpy_chk-1.ll @@ -64,10 +64,10 @@ define i8* @test_simplify5() { %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [12 x i8], [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false) +; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false) ; CHECK-NEXT: %1 = call i8* @__memcpy_chk(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i32 12, i32 %len) ; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 11) - %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false) + %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false, i1 false) %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 %len) ret i8* %ret } @@ -81,7 +81,7 @@ define i8* @test_simplify6() { ; CHECK-NEXT: %strlen = call i32 @strlen(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0)) ; CHECK-NEXT: %1 = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 %strlen ; CHECK-NEXT: ret i8* %1 - %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false) + %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false, i1 false) %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %dst, i32 %len) ret i8* %ret } @@ -100,4 +100,4 @@ define i8* @test_no_simplify1() { } declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind -declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly +declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) nounwind readonly diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll index 7a21a49c993c..824776c6ca18 100644 --- a/test/Transforms/InstCombine/strcpy_chk-1.ll +++ b/test/Transforms/InstCombine/strcpy_chk-1.ll @@ -64,10 +64,10 @@ define i8* @test_simplify5() { %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [12 x 
i8], [12 x i8]* @.str, i32 0, i32 0 -; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false) +; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false) ; CHECK-NEXT: %1 = call i8* @__memcpy_chk(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i32 12, i32 %len) ; CHECK-NEXT: ret i8* %1 - %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false) + %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false, i1 false) %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len) ret i8* %ret } @@ -78,10 +78,10 @@ define i8* @test_simplify6() { ; CHECK-LABEL: @test_simplify6( %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 -; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false) +; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false) ; CHECK-NEXT: %ret = call i8* @__strcpy_chk(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i32 %len) ; CHECK-NEXT: ret i8* %ret - %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false) + %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false, i1 false) %ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len) ret i8* %ret } @@ -100,4 +100,4 @@ define i8* @test_no_simplify1() { } declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind -declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly +declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) nounwind readonly diff --git a/test/Transforms/InstCombine/sub-xor.ll b/test/Transforms/InstCombine/sub-xor.ll index 9a0814c2c92f..812305d8e489 100644 --- a/test/Transforms/InstCombine/sub-xor.ll +++ b/test/Transforms/InstCombine/sub-xor.ll @@ -48,13 +48,3 @@ define i32 @test3(i32 %x) { ret i32 %add } -define i32 @test4(i32 %x) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: [[ADD:%.*]] = add i32 %x, -2147483606 -; CHECK-NEXT: ret i32 [[ADD]] -; - %sub = xor i32 %x, 2147483648 - %add = add i32 %sub, 42 - ret i32 %add -} - diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll index 32541f1f893e..2388301c726e 100644 --- a/test/Transforms/InstCombine/sub.ll +++ b/test/Transforms/InstCombine/sub.ll @@ -15,7 +15,7 @@ define i32 @test1(i32 %A) { define i32 @test2(i32 %A) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: ret i32 %A +; CHECK-NEXT: ret i32 [[A:%.*]] ; %B = sub i32 %A, 0 ret i32 %B @@ -23,7 +23,7 @@ define i32 @test2(i32 %A) { define i32 @test3(i32 %A) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: ret i32 %A +; CHECK-NEXT: ret i32 [[A:%.*]] ; %B = sub i32 0, %A %C = sub i32 0, %B @@ -32,7 +32,7 @@ define i32 @test3(i32 %A) { define i32 @test4(i32 %A, i32 %x) { ; CHECK-LABEL: @test4( -; CHECK-NEXT: [[C:%.*]] = add i32 %x, %A +; CHECK-NEXT: [[C:%.*]] = add i32 [[X:%.*]], [[A:%.*]] ; CHECK-NEXT: ret i32 [[C]] ; %B = sub i32 0, %A @@ -42,8 +42,8 @@ define i32 @test4(i32 %A, i32 %x) { define i32 @test5(i32 %A, i32 %B, i32 %C) { ; CHECK-LABEL: @test5( -; CHECK-NEXT: [[D1:%.*]] = sub i32 %C, %B -; CHECK-NEXT: [[E:%.*]] = add i32 [[D1]], %A +; CHECK-NEXT: [[D1:%.*]] = sub i32 [[C:%.*]], [[B:%.*]] +; CHECK-NEXT: [[E:%.*]] = add i32 [[D1]], [[A:%.*]] ; CHECK-NEXT: ret i32 
[[E]] ; %D = sub i32 %B, %C @@ -53,8 +53,8 @@ define i32 @test5(i32 %A, i32 %B, i32 %C) { define i32 @test6(i32 %A, i32 %B) { ; CHECK-LABEL: @test6( -; CHECK-NEXT: [[B_NOT:%.*]] = xor i32 %B, -1 -; CHECK-NEXT: [[D:%.*]] = and i32 %A, [[B_NOT]] +; CHECK-NEXT: [[B_NOT:%.*]] = xor i32 [[B:%.*]], -1 +; CHECK-NEXT: [[D:%.*]] = and i32 [[B_NOT]], [[A:%.*]] ; CHECK-NEXT: ret i32 [[D]] ; %C = and i32 %A, %B @@ -62,9 +62,20 @@ define i32 @test6(i32 %A, i32 %B) { ret i32 %D } +define i32 @test6commuted(i32 %A, i32 %B) { +; CHECK-LABEL: @test6commuted( +; CHECK-NEXT: [[B_NOT:%.*]] = xor i32 [[B:%.*]], -1 +; CHECK-NEXT: [[D:%.*]] = and i32 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: ret i32 [[D]] +; + %C = and i32 %B, %A + %D = sub i32 %A, %C + ret i32 %D +} + define i32 @test7(i32 %A) { ; CHECK-LABEL: @test7( -; CHECK-NEXT: [[B:%.*]] = xor i32 %A, -1 +; CHECK-NEXT: [[B:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: ret i32 [[B]] ; %B = sub i32 -1, %A @@ -73,7 +84,7 @@ define i32 @test7(i32 %A) { define i32 @test8(i32 %A) { ; CHECK-LABEL: @test8( -; CHECK-NEXT: [[C:%.*]] = shl i32 %A, 3 +; CHECK-NEXT: [[C:%.*]] = shl i32 [[A:%.*]], 3 ; CHECK-NEXT: ret i32 [[C]] ; %B = mul i32 9, %A @@ -83,7 +94,7 @@ define i32 @test8(i32 %A) { define i32 @test9(i32 %A) { ; CHECK-LABEL: @test9( -; CHECK-NEXT: [[C:%.*]] = mul i32 %A, -2 +; CHECK-NEXT: [[C:%.*]] = mul i32 [[A:%.*]], -2 ; CHECK-NEXT: ret i32 [[C]] ; %B = mul i32 3, %A @@ -93,7 +104,7 @@ define i32 @test9(i32 %A) { define i32 @test10(i32 %A, i32 %B) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: [[E:%.*]] = mul i32 %A, %B +; CHECK-NEXT: [[E:%.*]] = mul i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret i32 [[E]] ; %C = sub i32 0, %A @@ -104,7 +115,7 @@ define i32 @test10(i32 %A, i32 %B) { define i32 @test10a(i32 %A) { ; CHECK-LABEL: @test10a( -; CHECK-NEXT: [[E:%.*]] = mul i32 %A, -7 +; CHECK-NEXT: [[E:%.*]] = mul i32 [[A:%.*]], -7 ; CHECK-NEXT: ret i32 [[E]] ; %C = sub i32 0, %A @@ -114,7 +125,7 @@ define i32 @test10a(i32 %A) { define i1 @test11(i8 %A, i8 %B) { ; CHECK-LABEL: @test11( -; CHECK-NEXT: [[D:%.*]] = icmp ne i8 %A, %B +; CHECK-NEXT: [[D:%.*]] = icmp ne i8 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret i1 [[D]] ; %C = sub i8 %A, %B @@ -124,7 +135,7 @@ define i1 @test11(i8 %A, i8 %B) { define <2 x i1> @test11vec(<2 x i8> %A, <2 x i8> %B) { ; CHECK-LABEL: @test11vec( -; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i8> %A, %B +; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i8> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <2 x i1> [[D]] ; %C = sub <2 x i8> %A, %B @@ -134,7 +145,7 @@ define <2 x i1> @test11vec(<2 x i8> %A, <2 x i8> %B) { define i32 @test12(i32 %A) { ; CHECK-LABEL: @test12( -; CHECK-NEXT: [[C:%.*]] = lshr i32 %A, 31 +; CHECK-NEXT: [[C:%.*]] = lshr i32 [[A:%.*]], 31 ; CHECK-NEXT: ret i32 [[C]] ; %B = ashr i32 %A, 31 @@ -144,7 +155,7 @@ define i32 @test12(i32 %A) { define i32 @test13(i32 %A) { ; CHECK-LABEL: @test13( -; CHECK-NEXT: [[C:%.*]] = ashr i32 %A, 31 +; CHECK-NEXT: [[C:%.*]] = ashr i32 [[A:%.*]], 31 ; CHECK-NEXT: ret i32 [[C]] ; %B = lshr i32 %A, 31 @@ -154,7 +165,7 @@ define i32 @test13(i32 %A) { define <2 x i32> @test12vec(<2 x i32> %A) { ; CHECK-LABEL: @test12vec( -; CHECK-NEXT: [[C:%.*]] = lshr <2 x i32> %A, <i32 31, i32 31> +; CHECK-NEXT: [[C:%.*]] = lshr <2 x i32> [[A:%.*]], <i32 31, i32 31> ; CHECK-NEXT: ret <2 x i32> [[C]] ; %B = ashr <2 x i32> %A, <i32 31, i32 31> @@ -164,7 +175,7 @@ define <2 x i32> @test12vec(<2 x i32> %A) { define <2 x i32> @test13vec(<2 x i32> %A) { ; CHECK-LABEL: @test13vec( -; CHECK-NEXT: [[C:%.*]] = ashr <2 x i32> %A, <i32 31, i32 31> +; CHECK-NEXT: 
[[C:%.*]] = ashr <2 x i32> [[A:%.*]], <i32 31, i32 31> ; CHECK-NEXT: ret <2 x i32> [[C]] ; %B = lshr <2 x i32> %A, <i32 31, i32 31> @@ -174,8 +185,8 @@ define <2 x i32> @test13vec(<2 x i32> %A) { define i32 @test15(i32 %A, i32 %B) { ; CHECK-LABEL: @test15( -; CHECK-NEXT: [[C:%.*]] = sub i32 0, %A -; CHECK-NEXT: [[D:%.*]] = srem i32 %B, [[C]] +; CHECK-NEXT: [[C:%.*]] = sub i32 0, [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = srem i32 [[B:%.*]], [[C]] ; CHECK-NEXT: ret i32 [[D]] ; %C = sub i32 0, %A @@ -185,7 +196,7 @@ define i32 @test15(i32 %A, i32 %B) { define i32 @test16(i32 %A) { ; CHECK-LABEL: @test16( -; CHECK-NEXT: [[Y:%.*]] = sdiv i32 %A, -1123 +; CHECK-NEXT: [[Y:%.*]] = sdiv i32 [[A:%.*]], -1123 ; CHECK-NEXT: ret i32 [[Y]] ; %X = sdiv i32 %A, 1123 @@ -197,7 +208,7 @@ define i32 @test16(i32 %A) { ; PR3142 define i32 @test17(i32 %A) { ; CHECK-LABEL: @test17( -; CHECK-NEXT: [[B:%.*]] = sub i32 0, %A +; CHECK-NEXT: [[B:%.*]] = sub i32 0, [[A:%.*]] ; CHECK-NEXT: [[C:%.*]] = sdiv i32 [[B]], 1234 ; CHECK-NEXT: ret i32 [[C]] ; @@ -218,7 +229,7 @@ define i64 @test18(i64 %Y) { define i32 @test19(i32 %X, i32 %Y) { ; CHECK-LABEL: @test19( -; CHECK-NEXT: ret i32 %X +; CHECK-NEXT: ret i32 [[X:%.*]] ; %Z = sub i32 %X, %Y %Q = add i32 %Z, %Y @@ -227,7 +238,7 @@ define i32 @test19(i32 %X, i32 %Y) { define i1 @test20(i32 %g, i32 %h) { ; CHECK-LABEL: @test20( -; CHECK-NEXT: [[TMP_4:%.*]] = icmp ne i32 %h, 0 +; CHECK-NEXT: [[TMP_4:%.*]] = icmp ne i32 [[H:%.*]], 0 ; CHECK-NEXT: ret i1 [[TMP_4]] ; %tmp.2 = sub i32 %g, %h @@ -237,7 +248,7 @@ define i1 @test20(i32 %g, i32 %h) { define i1 @test21(i32 %g, i32 %h) { ; CHECK-LABEL: @test21( -; CHECK-NEXT: [[TMP_4:%.*]] = icmp ne i32 %h, 0 +; CHECK-NEXT: [[TMP_4:%.*]] = icmp ne i32 [[H:%.*]], 0 ; CHECK-NEXT: ret i1 [[TMP_4]] ; %tmp.2 = sub i32 %g, %h @@ -248,7 +259,7 @@ define i1 @test21(i32 %g, i32 %h) { ; PR2298 define zeroext i1 @test22(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: @test22( -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 %b, %a +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: ret i1 [[TMP5]] ; %tmp2 = sub i32 0, %a @@ -260,7 +271,7 @@ define zeroext i1 @test22(i32 %a, i32 %b) nounwind { ; rdar://7362831 define i32 @test23(i8* %P, i64 %A){ ; CHECK-LABEL: @test23( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 %A to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[A:%.*]] to i32 ; CHECK-NEXT: ret i32 [[TMP1]] ; %B = getelementptr inbounds i8, i8* %P, i64 %A @@ -274,7 +285,7 @@ define i32 @test23(i8* %P, i64 %A){ define i8 @test23_as1(i8 addrspace(1)* %P, i16 %A) { ; CHECK-LABEL: @test23_as1( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i16 %A to i8 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i16 [[A:%.*]] to i8 ; CHECK-NEXT: ret i8 [[TMP1]] ; %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A @@ -288,7 +299,7 @@ define i8 @test23_as1(i8 addrspace(1)* %P, i16 %A) { define i64 @test24(i8* %P, i64 %A){ ; CHECK-LABEL: @test24( -; CHECK-NEXT: ret i64 %A +; CHECK-NEXT: ret i64 [[A:%.*]] ; %B = getelementptr inbounds i8, i8* %P, i64 %A %C = ptrtoint i8* %B to i64 @@ -299,7 +310,7 @@ define i64 @test24(i8* %P, i64 %A){ define i16 @test24_as1(i8 addrspace(1)* %P, i16 %A) { ; CHECK-LABEL: @test24_as1( -; CHECK-NEXT: ret i16 %A +; CHECK-NEXT: ret i16 [[A:%.*]] ; %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A %C = ptrtoint i8 addrspace(1)* %B to i16 @@ -310,7 +321,7 @@ define i16 @test24_as1(i8 addrspace(1)* %P, i16 %A) { define i64 @test24a(i8* %P, i64 %A){ ; CHECK-LABEL: @test24a( -; CHECK-NEXT: [[DIFF_NEG:%.*]] = sub i64 0, %A +; CHECK-NEXT: 
[[DIFF_NEG:%.*]] = sub i64 0, [[A:%.*]] ; CHECK-NEXT: ret i64 [[DIFF_NEG]] ; %B = getelementptr inbounds i8, i8* %P, i64 %A @@ -322,7 +333,7 @@ define i64 @test24a(i8* %P, i64 %A){ define i16 @test24a_as1(i8 addrspace(1)* %P, i16 %A) { ; CHECK-LABEL: @test24a_as1( -; CHECK-NEXT: [[DIFF_NEG:%.*]] = sub i16 0, %A +; CHECK-NEXT: [[DIFF_NEG:%.*]] = sub i16 0, [[A:%.*]] ; CHECK-NEXT: ret i16 [[DIFF_NEG]] ; %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A @@ -337,7 +348,7 @@ define i16 @test24a_as1(i8 addrspace(1)* %P, i16 %A) { define i64 @test24b(i8* %P, i64 %A){ ; CHECK-LABEL: @test24b( -; CHECK-NEXT: [[B_IDX:%.*]] = shl nuw i64 %A, 1 +; CHECK-NEXT: [[B_IDX:%.*]] = shl nuw i64 [[A:%.*]], 1 ; CHECK-NEXT: ret i64 [[B_IDX]] ; %B = getelementptr inbounds [42 x i16], [42 x i16]* @Arr, i64 0, i64 %A @@ -349,7 +360,7 @@ define i64 @test24b(i8* %P, i64 %A){ define i64 @test25(i8* %P, i64 %A){ ; CHECK-LABEL: @test25( -; CHECK-NEXT: [[B_IDX:%.*]] = shl nuw i64 %A, 1 +; CHECK-NEXT: [[B_IDX:%.*]] = shl nuw i64 [[A:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[B_IDX]], -84 ; CHECK-NEXT: ret i64 [[TMP1]] ; @@ -363,7 +374,7 @@ define i64 @test25(i8* %P, i64 %A){ define i16 @test25_as1(i8 addrspace(1)* %P, i64 %A) { ; CHECK-LABEL: @test25_as1( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 %A to i16 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[A:%.*]] to i16 ; CHECK-NEXT: [[B_IDX:%.*]] = shl nuw i16 [[TMP1]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i16 [[B_IDX]], -84 ; CHECK-NEXT: ret i16 [[TMP2]] @@ -376,7 +387,7 @@ define i16 @test25_as1(i8 addrspace(1)* %P, i64 %A) { define i32 @test26(i32 %x) { ; CHECK-LABEL: @test26( -; CHECK-NEXT: [[NEG:%.*]] = shl i32 -3, %x +; CHECK-NEXT: [[NEG:%.*]] = shl i32 -3, [[X:%.*]] ; CHECK-NEXT: ret i32 [[NEG]] ; %shl = shl i32 3, %x @@ -386,8 +397,8 @@ define i32 @test26(i32 %x) { define i32 @test27(i32 %x, i32 %y) { ; CHECK-LABEL: @test27( -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 %y, 3 -; CHECK-NEXT: [[SUB:%.*]] = add i32 [[TMP1]], %x +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[Y:%.*]], 3 +; CHECK-NEXT: [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret i32 [[SUB]] ; %mul = mul i32 %y, -8 @@ -395,10 +406,87 @@ define i32 @test27(i32 %x, i32 %y) { ret i32 %sub } +define <2 x i32> @test27vec(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @test27vec( +; CHECK-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[Y:%.*]], <i32 8, i32 6> +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret <2 x i32> [[SUB]] +; + %mul = mul <2 x i32> %y, <i32 -8, i32 -6> + %sub = sub <2 x i32> %x, %mul + ret <2 x i32> %sub +} + +define <2 x i32> @test27vecsplat(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @test27vecsplat( +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> [[Y:%.*]], <i32 3, i32 3> +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret <2 x i32> [[SUB]] +; + %mul = mul <2 x i32> %y, <i32 -8, i32 -8> + %sub = sub <2 x i32> %x, %mul + ret <2 x i32> %sub +} + +define <2 x i32> @test27vecmixed(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @test27vecmixed( +; CHECK-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[Y:%.*]], <i32 8, i32 -8> +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret <2 x i32> [[SUB]] +; + %mul = mul <2 x i32> %y, <i32 -8, i32 8> + %sub = sub <2 x i32> %x, %mul + ret <2 x i32> %sub +} + +define i32 @test27commuted(i32 %x, i32 %y) { +; CHECK-LABEL: @test27commuted( +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[Y:%.*]], 3 +; CHECK-NEXT: [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i32 [[SUB]] +; + %mul = 
mul i32 -8, %y + %sub = sub i32 %x, %mul + ret i32 %sub +} + +define <2 x i32> @test27commutedvec(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @test27commutedvec( +; CHECK-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[Y:%.*]], <i32 8, i32 6> +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret <2 x i32> [[SUB]] +; + %mul = mul <2 x i32> <i32 -8, i32 -6>, %y + %sub = sub <2 x i32> %x, %mul + ret <2 x i32> %sub +} + +define <2 x i32> @test27commutedvecsplat(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @test27commutedvecsplat( +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> [[Y:%.*]], <i32 3, i32 3> +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret <2 x i32> [[SUB]] +; + %mul = mul <2 x i32> <i32 -8, i32 -8>, %y + %sub = sub <2 x i32> %x, %mul + ret <2 x i32> %sub +} + +define <2 x i32> @test27commutedvecmixed(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @test27commutedvecmixed( +; CHECK-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[Y:%.*]], <i32 8, i32 -8> +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret <2 x i32> [[SUB]] +; + %mul = mul <2 x i32> <i32 -8, i32 8>, %y + %sub = sub <2 x i32> %x, %mul + ret <2 x i32> %sub +} + define i32 @test28(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test28( -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 %z, %y -; CHECK-NEXT: [[SUB:%.*]] = add i32 [[TMP1]], %x +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[Z:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret i32 [[SUB]] ; %neg = sub i32 0, %z @@ -407,9 +495,21 @@ define i32 @test28(i32 %x, i32 %y, i32 %z) { ret i32 %sub } +define i32 @test28commuted(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @test28commuted( +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[Y:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i32 [[SUB]] +; + %neg = sub i32 0, %z + %mul = mul i32 %y, %neg + %sub = sub i32 %x, %mul + ret i32 %sub +} + define i64 @test29(i8* %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test29( -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 %i, %j +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[I:%.*]], [[J:%.*]] ; CHECK-NEXT: ret i64 [[TMP1]] ; %gep1 = getelementptr inbounds i8, i8* %foo, i64 %i @@ -422,8 +522,8 @@ define i64 @test29(i8* %foo, i64 %i, i64 %j) { define i64 @test30(i8* %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test30( -; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i64 %i, 2 -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[GEP1_IDX]], %j +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i64 [[I:%.*]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[GEP1_IDX]], [[J:%.*]] ; CHECK-NEXT: ret i64 [[TMP1]] ; %bit = bitcast i8* %foo to i32* @@ -437,8 +537,8 @@ define i64 @test30(i8* %foo, i64 %i, i64 %j) { define i16 @test30_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) { ; CHECK-LABEL: @test30_as1( -; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i16 %i, 2 -; CHECK-NEXT: [[TMP1:%.*]] = sub i16 [[GEP1_IDX]], %j +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i16 [[I:%.*]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i16 [[GEP1_IDX]], [[J:%.*]] ; CHECK-NEXT: ret i16 [[TMP1]] ; %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)* @@ -452,7 +552,7 @@ define i16 @test30_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) { define <2 x i64> @test31(<2 x i64> %A) { ; CHECK-LABEL: @test31( -; CHECK-NEXT: [[SUB:%.*]] = add <2 x i64> %A, <i64 3, i64 4> +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i64> [[A:%.*]], <i64 3, i64 4> ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %xor = xor <2 x i64> %A, <i64 -1, i64 -1> @@ -462,7 +562,7 @@ define <2 x i64> @test31(<2 x i64> %A) { 
define <2 x i64> @test32(<2 x i64> %A) { ; CHECK-LABEL: @test32( -; CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> <i64 3, i64 4>, %A +; CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> <i64 3, i64 4>, [[A:%.*]] ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %add = add <2 x i64> %A, <i64 -1, i64 -1> @@ -472,7 +572,7 @@ define <2 x i64> @test32(<2 x i64> %A) { define <2 x i64> @test33(<2 x i1> %A) { ; CHECK-LABEL: @test33( -; CHECK-NEXT: [[SUB:%.*]] = sext <2 x i1> %A to <2 x i64> +; CHECK-NEXT: [[SUB:%.*]] = sext <2 x i1> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %ext = zext <2 x i1> %A to <2 x i64> @@ -482,7 +582,7 @@ define <2 x i64> @test33(<2 x i1> %A) { define <2 x i64> @test34(<2 x i1> %A) { ; CHECK-LABEL: @test34( -; CHECK-NEXT: [[SUB:%.*]] = zext <2 x i1> %A to <2 x i64> +; CHECK-NEXT: [[SUB:%.*]] = zext <2 x i1> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %ext = sext <2 x i1> %A to <2 x i64> @@ -492,7 +592,7 @@ define <2 x i64> @test34(<2 x i1> %A) { define <2 x i64> @test35(<2 x i64> %A) { ; CHECK-LABEL: @test35( -; CHECK-NEXT: [[SUB:%.*]] = mul <2 x i64> %A, <i64 -2, i64 -3> +; CHECK-NEXT: [[SUB:%.*]] = mul <2 x i64> [[A:%.*]], <i64 -2, i64 -3> ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %mul = mul <2 x i64> %A, <i64 3, i64 4> @@ -502,7 +602,7 @@ define <2 x i64> @test35(<2 x i64> %A) { define <2 x i64> @test36(<2 x i64> %A) { ; CHECK-LABEL: @test36( -; CHECK-NEXT: [[SUB:%.*]] = mul <2 x i64> %A, <i64 7, i64 15> +; CHECK-NEXT: [[SUB:%.*]] = mul <2 x i64> [[A:%.*]], <i64 7, i64 15> ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %shl = shl <2 x i64> %A, <i64 3, i64 4> @@ -512,7 +612,7 @@ define <2 x i64> @test36(<2 x i64> %A) { define <2 x i32> @test37(<2 x i32> %A) { ; CHECK-LABEL: @test37( -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i32> %A, <i32 -2147483648, i32 -2147483648> +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i32> [[A:%.*]], <i32 -2147483648, i32 -2147483648> ; CHECK-NEXT: [[SUB:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[SUB]] ; @@ -523,7 +623,7 @@ define <2 x i32> @test37(<2 x i32> %A) { define i32 @test38(i32 %A) { ; CHECK-LABEL: @test38( -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 %A, -2147483648 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[A:%.*]], -2147483648 ; CHECK-NEXT: [[SUB:%.*]] = sext i1 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[SUB]] ; @@ -534,7 +634,7 @@ define i32 @test38(i32 %A) { define i32 @test39(i32 %A, i32 %x) { ; CHECK-LABEL: @test39( -; CHECK-NEXT: [[C:%.*]] = add i32 %x, %A +; CHECK-NEXT: [[C:%.*]] = add i32 [[X:%.*]], [[A:%.*]] ; CHECK-NEXT: ret i32 [[C]] ; %B = sub i32 0, %A @@ -544,8 +644,8 @@ define i32 @test39(i32 %A, i32 %x) { define i16 @test40(i16 %a, i16 %b) { ; CHECK-LABEL: @test40( -; CHECK-NEXT: [[ASHR:%.*]] = ashr i16 %a, 1 -; CHECK-NEXT: [[ASHR1:%.*]] = ashr i16 %b, 1 +; CHECK-NEXT: [[ASHR:%.*]] = ashr i16 [[A:%.*]], 1 +; CHECK-NEXT: [[ASHR1:%.*]] = ashr i16 [[B:%.*]], 1 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i16 [[ASHR]], [[ASHR1]] ; CHECK-NEXT: ret i16 [[SUB]] ; @@ -557,8 +657,8 @@ define i16 @test40(i16 %a, i16 %b) { define i32 @test41(i16 %a, i16 %b) { ; CHECK-LABEL: @test41( -; CHECK-NEXT: [[CONV:%.*]] = sext i16 %a to i32 -; CHECK-NEXT: [[CONV1:%.*]] = sext i16 %b to i32 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[A:%.*]] to i32 +; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[B:%.*]] to i32 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV1]] ; CHECK-NEXT: ret i32 [[SUB]] ; @@ -570,8 +670,8 @@ define i32 @test41(i16 %a, i16 %b) { define i4 @test42(i4 %x, i4 %y) { ; CHECK-LABEL: @test42( -; CHECK-NEXT: [[A:%.*]] 
= and i4 %y, 7 -; CHECK-NEXT: [[B:%.*]] = and i4 %x, 7 +; CHECK-NEXT: [[A:%.*]] = and i4 [[Y:%.*]], 7 +; CHECK-NEXT: [[B:%.*]] = and i4 [[X:%.*]], 7 ; CHECK-NEXT: [[C:%.*]] = sub nsw i4 [[A]], [[B]] ; CHECK-NEXT: ret i4 [[C]] ; @@ -583,8 +683,8 @@ define i4 @test42(i4 %x, i4 %y) { define i4 @test43(i4 %x, i4 %y) { ; CHECK-LABEL: @test43( -; CHECK-NEXT: [[A:%.*]] = or i4 %x, -8 -; CHECK-NEXT: [[B:%.*]] = and i4 %y, 7 +; CHECK-NEXT: [[A:%.*]] = or i4 [[X:%.*]], -8 +; CHECK-NEXT: [[B:%.*]] = and i4 [[Y:%.*]], 7 ; CHECK-NEXT: [[C:%.*]] = sub nuw i4 [[A]], [[B]] ; CHECK-NEXT: ret i4 [[C]] ; @@ -596,7 +696,7 @@ define i4 @test43(i4 %x, i4 %y) { define i32 @test44(i32 %x) { ; CHECK-LABEL: @test44( -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 %x, -32768 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X:%.*]], -32768 ; CHECK-NEXT: ret i32 [[SUB]] ; %sub = sub nsw i32 %x, 32768 @@ -605,7 +705,7 @@ define i32 @test44(i32 %x) { define i32 @test45(i32 %x, i32 %y) { ; CHECK-LABEL: @test45( -; CHECK-NEXT: [[SUB:%.*]] = and i32 %x, %y +; CHECK-NEXT: [[SUB:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret i32 [[SUB]] ; %or = or i32 %x, %y @@ -614,10 +714,21 @@ define i32 @test45(i32 %x, i32 %y) { ret i32 %sub } +define i32 @test45commuted(i32 %x, i32 %y) { +; CHECK-LABEL: @test45commuted( +; CHECK-NEXT: [[SUB:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i32 [[SUB]] +; + %or = or i32 %x, %y + %xor = xor i32 %y, %x + %sub = sub i32 %or, %xor + ret i32 %sub +} + define i32 @test46(i32 %x, i32 %y) { ; CHECK-LABEL: @test46( -; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 -; CHECK-NEXT: [[SUB:%.*]] = and i32 %y, [[X_NOT]] +; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1 +; CHECK-NEXT: [[SUB:%.*]] = and i32 [[X_NOT]], [[Y:%.*]] ; CHECK-NEXT: ret i32 [[SUB]] ; %or = or i32 %x, %y @@ -625,10 +736,21 @@ define i32 @test46(i32 %x, i32 %y) { ret i32 %sub } +define i32 @test46commuted(i32 %x, i32 %y) { +; CHECK-LABEL: @test46commuted( +; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1 +; CHECK-NEXT: [[SUB:%.*]] = and i32 [[X_NOT]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[SUB]] +; + %or = or i32 %y, %x + %sub = sub i32 %or, %x + ret i32 %sub +} + define i32 @test47(i1 %A, i32 %B, i32 %C, i32 %D) { ; CHECK-LABEL: @test47( -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 %D, %C -; CHECK-NEXT: [[SUB:%.*]] = select i1 %A, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[D:%.*]], [[C:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = select i1 [[A:%.*]], i32 [[TMP1]], i32 0 ; CHECK-NEXT: ret i32 [[SUB]] ; %sel0 = select i1 %A, i32 %D, i32 %B @@ -639,8 +761,8 @@ define i32 @test47(i1 %A, i32 %B, i32 %C, i32 %D) { define i32 @test48(i1 %A, i32 %B, i32 %C, i32 %D) { ; CHECK-LABEL: @test48( -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 %D, %C -; CHECK-NEXT: [[SUB:%.*]] = select i1 %A, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[D:%.*]], [[C:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = select i1 [[A:%.*]], i32 0, i32 [[TMP1]] ; CHECK-NEXT: ret i32 [[SUB]] ; %sel0 = select i1 %A, i32 %B, i32 %D @@ -653,8 +775,8 @@ define i32 @test48(i1 %A, i32 %B, i32 %C, i32 %D) { define i8 @bool_sext_sub(i8 %x, i1 %y) { ; CHECK-LABEL: @bool_sext_sub( -; CHECK-NEXT: [[TMP1:%.*]] = zext i1 %y to i8 -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[TMP1]], %x +; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[Y:%.*]] to i8 +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret i8 [[SUB]] ; %sext = sext i1 %y to i8 @@ -666,8 +788,8 @@ define i8 @bool_sext_sub(i8 %x, i1 %y) { define <2 x i8> @bool_sext_sub_vec(<2 x i8> %x, <2 x i1> %y) { ; CHECK-LABEL: 
@bool_sext_sub_vec( -; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i1> %y to <2 x i8> -; CHECK-NEXT: [[SUB:%.*]] = add <2 x i8> [[TMP1]], %x +; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i1> [[Y:%.*]] to <2 x i8> +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i8> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i8> [[SUB]] ; %sext = sext <2 x i1> %y to <2 x i8> @@ -679,8 +801,8 @@ define <2 x i8> @bool_sext_sub_vec(<2 x i8> %x, <2 x i1> %y) { define <2 x i8> @bool_sext_sub_vec_nsw(<2 x i8> %x, <2 x i1> %y) { ; CHECK-LABEL: @bool_sext_sub_vec_nsw( -; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i1> %y to <2 x i8> -; CHECK-NEXT: [[SUB:%.*]] = add nsw <2 x i8> [[TMP1]], %x +; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i1> [[Y:%.*]] to <2 x i8> +; CHECK-NEXT: [[SUB:%.*]] = add nsw <2 x i8> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i8> [[SUB]] ; %sext = sext <2 x i1> %y to <2 x i8> @@ -692,8 +814,8 @@ define <2 x i8> @bool_sext_sub_vec_nsw(<2 x i8> %x, <2 x i1> %y) { define i8 @bool_sext_sub_nuw(i8 %x, i1 %y) { ; CHECK-LABEL: @bool_sext_sub_nuw( -; CHECK-NEXT: [[TMP1:%.*]] = zext i1 %y to i8 -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[TMP1]], %x +; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[Y:%.*]] to i8 +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret i8 [[SUB]] ; %sext = sext i1 %y to i8 @@ -701,3 +823,169 @@ define i8 @bool_sext_sub_nuw(i8 %x, i1 %y) { ret i8 %sub } +define i32 @test49(i32 %X) { +; CHECK-LABEL: @test49( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 1, [[X:%.*]] +; CHECK-NEXT: [[RES:%.*]] = and i32 [[SUB]], 64 +; CHECK-NEXT: ret i32 [[RES]] +; + %sub = sub i32 129, %X + %res = and i32 %sub, 64 + ret i32 %res +} + +define i32 @test50(i32 %X) { +; CHECK-LABEL: @test50( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 1, [[X:%.*]] +; CHECK-NEXT: [[RES:%.*]] = and i32 [[SUB]], 127 +; CHECK-NEXT: ret i32 [[RES]] +; + %sub = sub i32 129, %X + %res = and i32 %sub, 127 + ret i32 %res +} + +define i32 @test51(i32 %X) { +; CHECK-LABEL: @test51( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 126, [[X:%.*]] +; CHECK-NEXT: [[RES:%.*]] = and i32 [[SUB]], 64 +; CHECK-NEXT: ret i32 [[RES]] +; + %sub = sub i32 254, %X + %res = and i32 %sub, 64 + ret i32 %res +} + +define i32 @test52(i32 %X) { +; CHECK-LABEL: @test52( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 126, [[X:%.*]] +; CHECK-NEXT: [[RES:%.*]] = and i32 [[SUB]], 127 +; CHECK-NEXT: ret i32 [[RES]] +; + %sub = sub i32 254, %X + %res = and i32 %sub, 127 + ret i32 %res +} + +define <2 x i1> @test53(<2 x i1> %A, <2 x i1> %B) { +; CHECK-LABEL: @test53( +; CHECK-NEXT: [[SUB:%.*]] = xor <2 x i1> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <2 x i1> [[SUB]] +; + %sub = sub <2 x i1> %A, %B + ret <2 x i1> %sub +} + +define i32 @test54(i1 %C) { +; CHECK-LABEL: @test54( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], i32 -877, i32 113 +; CHECK-NEXT: ret i32 [[V]] +; + %A = select i1 %C, i32 1000, i32 10 + %V = sub i32 123, %A + ret i32 %V +} + +define <2 x i32> @test54vec(i1 %C) { +; CHECK-LABEL: @test54vec( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 -877, i32 -877>, <2 x i32> <i32 113, i32 113> +; CHECK-NEXT: ret <2 x i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10> + %V = sub <2 x i32> <i32 123, i32 123>, %A + ret <2 x i32> %V +} + +define <2 x i32> @test54vec2(i1 %C) { +; CHECK-LABEL: @test54vec2( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 -877, i32 -2167>, <2 x i32> <i32 113, i32 303> +; CHECK-NEXT: ret <2 x i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30> + %V = sub <2 x i32> <i32 
123, i32 333>, %A + ret <2 x i32> %V +} + +define i32 @test55(i1 %which) { +; CHECK-LABEL: @test55( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi i32 [ -877, [[ENTRY:%.*]] ], [ 113, [[DELAY]] ] +; CHECK-NEXT: ret i32 [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi i32 [ 1000, %entry ], [ 10, %delay ] + %value = sub i32 123, %A + ret i32 %value +} + +define <2 x i32> @test55vec(i1 %which) { +; CHECK-LABEL: @test55vec( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 -877, i32 -877>, [[ENTRY:%.*]] ], [ <i32 113, i32 113>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ] + %value = sub <2 x i32> <i32 123, i32 123>, %A + ret <2 x i32> %value +} + +define <2 x i32> @test55vec2(i1 %which) { +; CHECK-LABEL: @test55vec2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 -877, i32 -2167>, [[ENTRY:%.*]] ], [ <i32 113, i32 303>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ] + %value = sub <2 x i32> <i32 123, i32 333>, %A + ret <2 x i32> %value +} + +define i32 @test56(i32 %A, i32 %B) { +; CHECK-LABEL: @test56( +; CHECK-NEXT: [[Y:%.*]] = sub i32 0, [[B:%.*]] +; CHECK-NEXT: ret i32 [[Y]] +; + %X = add i32 %A, %B + %Y = sub i32 %A, %X + ret i32 %Y } + +define i32 @test57(i32 %A, i32 %B) { +; CHECK-LABEL: @test57( +; CHECK-NEXT: [[Y:%.*]] = sub i32 0, [[B:%.*]] +; CHECK-NEXT: ret i32 [[Y]] +; + %X = add i32 %B, %A + %Y = sub i32 %A, %X + ret i32 %Y } diff --git a/test/Transforms/InstCombine/trunc.ll b/test/Transforms/InstCombine/trunc.ll index eaa45bbb286c..5597b578f017 100644 --- a/test/Transforms/InstCombine/trunc.ll +++ b/test/Transforms/InstCombine/trunc.ll @@ -119,8 +119,8 @@ define i64 @test8(i32 %A, i32 %B) { define i8 @test9(i32 %X) { ; CHECK-LABEL: @test9( -; CHECK-NEXT: [[X_TR:%.*]] = trunc i32 %X to i8 -; CHECK-NEXT: [[Z:%.*]] = and i8 [[X_TR]], 42 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 %X to i8 +; CHECK-NEXT: [[Z:%.*]] = and i8 [[TMP1]], 42 ; CHECK-NEXT: ret i8 [[Z]] ; %Y = and i32 %X, 42 @@ -464,3 +464,72 @@ define <8 x i16> @trunc_shl_v8i16_v8i32_4(<8 x i32> %a) { ret <8 x i16> %conv } +; Although the mask is the same value, we don't create a shuffle for types that the backend may not be able to handle: +; trunc (shuffle X, C, Mask) --> shuffle (trunc X), C', Mask + +define <4 x i8> @wide_shuf(<4 x i32> %x) { +; CHECK-LABEL: @wide_shuf( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> %x, <4 x i32> <i32 undef, i32 3634, i32 90, i32 undef>, <4 x i32> <i32 1, i32 5, i32 6, i32 2> +; CHECK-NEXT: [[TRUNC:%.*]] = trunc <4 x i32> [[SHUF]] to <4 x i8> +; CHECK-NEXT: ret <4 x i8> [[TRUNC]] +; + %shuf = shufflevector <4 x i32> %x, <4 x i32> <i32 35, i32 3634, i32 90, i32 -1>, <4 x i32> <i32 1, i32 5, i32 6, i32 2> + 
%trunc = trunc <4 x i32> %shuf to <4 x i8> + ret <4 x i8> %trunc +} + +; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask + +define <4 x i8> @wide_splat1(<4 x i32> %x) { +; CHECK-LABEL: @wide_splat1( +; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> %x to <4 x i8> +; CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: ret <4 x i8> [[TRUNC]] +; + %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + %trunc = trunc <4 x i32> %shuf to <4 x i8> + ret <4 x i8> %trunc +} + +; Test weird types. +; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask + +define <3 x i31> @wide_splat2(<3 x i33> %x) { +; CHECK-LABEL: @wide_splat2( +; CHECK-NEXT: [[TMP1:%.*]] = trunc <3 x i33> %x to <3 x i31> +; CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <3 x i31> [[TMP1]], <3 x i31> undef, <3 x i32> <i32 1, i32 1, i32 1> +; CHECK-NEXT: ret <3 x i31> [[TRUNC]] +; + %shuf = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> <i32 1, i32 1, i32 1> + %trunc = trunc <3 x i33> %shuf to <3 x i31> + ret <3 x i31> %trunc +} + +; FIXME: +; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask +; A mask with undef elements should still be considered a splat mask. + +define <3 x i31> @wide_splat3(<3 x i33> %x) { +; CHECK-LABEL: @wide_splat3( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> <i32 undef, i32 1, i32 1> +; CHECK-NEXT: [[TRUNC:%.*]] = trunc <3 x i33> [[SHUF]] to <3 x i31> +; CHECK-NEXT: ret <3 x i31> [[TRUNC]] +; + %shuf = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> <i32 undef, i32 1, i32 1> + %trunc = trunc <3 x i33> %shuf to <3 x i31> + ret <3 x i31> %trunc +} + +; TODO: The shuffle extends the length of the input vector. Should we shrink this? + +define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) { +; CHECK-LABEL: @wide_lengthening_splat( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TR:%.*]] = trunc <8 x i16> [[SHUF]] to <8 x i8> +; CHECK-NEXT: ret <8 x i8> [[TR]] +; + %shuf = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer + %tr = trunc <8 x i16> %shuf to <8 x i8> + ret <8 x i8> %tr +} + diff --git a/test/Transforms/InstCombine/type_pun.ll b/test/Transforms/InstCombine/type_pun.ll index 098164cd029f..56d1ffcb5d31 100644 --- a/test/Transforms/InstCombine/type_pun.ll +++ b/test/Transforms/InstCombine/type_pun.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s ; Ensure that type punning using a union of vector and same-sized array @@ -17,9 +18,10 @@ target datalayout = "p:32:32" ; Extracting the zeroth element in an i32 array. 
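; A rough sketch of the fold exercised below (illustrative only, not part of
; this patch): bitcast (shufflevector X, undef, Mask) --> extractelement (bitcast X), Index
;
;   %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;   %val = bitcast <4 x i8> %sroa to i32
; -->
;   %bc = bitcast <16 x i8> %in to <4 x i32>
;   %val = extractelement <4 x i32> %bc, i32 0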
define i32 @type_pun_zeroth(<16 x i8> %in) { ; CHECK-LABEL: @type_pun_zeroth( -; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> -; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0 -; CHECK-NEXT: ret i32 %[[EXT]] +; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32> +; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 +; CHECK-NEXT: ret i32 [[SROA_EXTRACT]] +; %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %1 = bitcast <4 x i8> %sroa to i32 ret i32 %1 @@ -28,9 +30,10 @@ define i32 @type_pun_zeroth(<16 x i8> %in) { ; Extracting the first element in an i32 array. define i32 @type_pun_first(<16 x i8> %in) { ; CHECK-LABEL: @type_pun_first( -; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> -; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 1 -; CHECK-NEXT: ret i32 %[[EXT]] +; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32> +; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 1 +; CHECK-NEXT: ret i32 [[SROA_EXTRACT]] +; %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %1 = bitcast <4 x i8> %sroa to i32 ret i32 %1 @@ -39,10 +42,11 @@ define i32 @type_pun_first(<16 x i8> %in) { ; Extracting an i32 that isn't aligned to any natural boundary. define i32 @type_pun_misaligned(<16 x i8> %in) { ; CHECK-LABEL: @type_pun_misaligned( -; CHECK-NEXT: %[[SHUF:.*]] = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> -; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %[[SHUF]] to <4 x i32> -; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0 -; CHECK-NEXT: ret i32 %[[EXT]] +; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = shufflevector <16 x i8> [[IN:%.*]], <16 x i8> undef, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[SROA_EXTRACT]] to <4 x i32> +; CHECK-NEXT: [[SROA_EXTRACT1:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 +; CHECK-NEXT: ret i32 [[SROA_EXTRACT1]] +; %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9> %1 = bitcast <4 x i8> %sroa to i32 ret i32 %1 @@ -51,10 +55,11 @@ define i32 @type_pun_misaligned(<16 x i8> %in) { ; Type punning to an array of pointers. define i32* @type_pun_pointer(<16 x i8> %in) { ; CHECK-LABEL: @type_pun_pointer( -; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> -; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i32> %[[BC]], i32 0 -; CHECK-NEXT: %[[I2P:.*]] = inttoptr i32 %[[EXT]] to i32* -; CHECK-NEXT: ret i32* %[[I2P]] +; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32> +; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i32 [[SROA_EXTRACT]] to i32* +; CHECK-NEXT: ret i32* [[TMP1]] +; %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %1 = bitcast <4 x i8> %sroa to i32 %2 = inttoptr i32 %1 to i32* @@ -64,9 +69,10 @@ define i32* @type_pun_pointer(<16 x i8> %in) { ; Type punning to an array of 32-bit floating-point values. 
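; The same rewrite with a different punned element type (a hand-written
; sketch, not from this patch):
;
;   %f = bitcast <4 x i8> %sroa to float
; -->
;   %bc = bitcast <16 x i8> %in to <4 x float>
;   %f = extractelement <4 x float> %bc, i32 0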
define float @type_pun_float(<16 x i8> %in) { ; CHECK-LABEL: @type_pun_float( -; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x float> -; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x float> %[[BC]], i32 0 -; CHECK-NEXT: ret float %[[EXT]] +; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x float> +; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = extractelement <4 x float> [[SROA_BC]], i32 0 +; CHECK-NEXT: ret float [[SROA_EXTRACT]] +; %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %1 = bitcast <4 x i8> %sroa to float ret float %1 @@ -75,9 +81,10 @@ define float @type_pun_float(<16 x i8> %in) { ; Type punning to an array of 64-bit floating-point values. define double @type_pun_double(<16 x i8> %in) { ; CHECK-LABEL: @type_pun_double( -; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <2 x double> -; CHECK-NEXT: %[[EXT:.*]] = extractelement <2 x double> %[[BC]], i32 0 -; CHECK-NEXT: ret double %[[EXT]] +; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <2 x double> +; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = extractelement <2 x double> [[SROA_BC]], i32 0 +; CHECK-NEXT: ret double [[SROA_EXTRACT]] +; %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %1 = bitcast <8 x i8> %sroa to double ret double %1 @@ -87,13 +94,14 @@ define double @type_pun_double(<16 x i8> %in) { ; Verify that multiple uses with different bitcast types are properly handled. define { float, i32 } @type_pun_float_i32(<16 x i8> %in) { ; CHECK-LABEL: @type_pun_float_i32( -; CHECK-NEXT: %[[BCI:.*]] = bitcast <16 x i8> %in to <4 x i32> -; CHECK-NEXT: %[[EXTI:.*]] = extractelement <4 x i32> %[[BCI]], i32 0 -; CHECK-NEXT: %[[BCF:.*]] = bitcast <16 x i8> %in to <4 x float> -; CHECK-NEXT: %[[EXTF:.*]] = extractelement <4 x float> %[[BCF]], i32 0 -; CHECK-NEXT: %1 = insertvalue { float, i32 } undef, float %[[EXTF]], 0 -; CHECK-NEXT: %2 = insertvalue { float, i32 } %1, i32 %[[EXTI]], 1 -; CHECK-NEXT: ret { float, i32 } %2 +; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32> +; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 +; CHECK-NEXT: [[SROA_BC1:%.*]] = bitcast <16 x i8> [[IN]] to <4 x float> +; CHECK-NEXT: [[SROA_EXTRACT2:%.*]] = extractelement <4 x float> [[SROA_BC1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { float, i32 } undef, float [[SROA_EXTRACT2]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { float, i32 } [[TMP1]], i32 [[SROA_EXTRACT]], 1 +; CHECK-NEXT: ret { float, i32 } [[TMP2]] +; %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %f = bitcast <4 x i8> %sroa to float %i = bitcast <4 x i8> %sroa to i32 @@ -106,24 +114,29 @@ define { float, i32 } @type_pun_float_i32(<16 x i8> %in) { ; Verify that the bitcast is shared and dominates usage. 
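; Sketch of the expected shape (illustrative only): one wide bitcast is
; emitted in the entry block so it dominates the extracts in both arms:
;
;   entry: %bc = bitcast <16 x i8> %in to <4 x i32>
;   left:  %lhs = extractelement <4 x i32> %bc, i32 0
;   right: %rhs = extractelement <4 x i32> %bc, i32 0
;   tail:  %i = phi i32 [ %lhs, %left ], [ %rhs, %right ]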
define i32 @type_pun_i32_ctrl(<16 x i8> %in) { ; CHECK-LABEL: @type_pun_i32_ctrl( -entry: ; CHECK-NEXT: entry: -; CHECK-NEXT: %[[BC:.*]] = bitcast <16 x i8> %in to <4 x i32> -; CHECK-NEXT: br +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32> +; CHECK-NEXT: br i1 undef, label [[LEFT:%.*]], label [[RIGHT:%.*]] +; CHECK: left: +; CHECK-NEXT: [[SROA_EXTRACT1:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 +; CHECK-NEXT: br label [[TAIL:%.*]] +; CHECK: right: +; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 +; CHECK-NEXT: br label [[TAIL]] +; CHECK: tail: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[SROA_EXTRACT1]], [[LEFT]] ], [ [[SROA_EXTRACT]], [[RIGHT]] ] +; CHECK-NEXT: ret i32 [[I]] +; +entry: %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> br i1 undef, label %left, label %right -left: ; CHECK: left: -; CHECK-NEXT: %[[EXTL:.*]] = extractelement <4 x i32> %[[BC]], i32 0 -; CHECK-NEXT: br +left: %lhs = bitcast <4 x i8> %sroa to i32 br label %tail -right: ; CHECK: right: -; CHECK-NEXT: %[[EXTR:.*]] = extractelement <4 x i32> %[[BC]], i32 0 -; CHECK-NEXT: br +right: %rhs = bitcast <4 x i8> %sroa to i32 br label %tail -tail: ; CHECK: tail: -; CHECK-NEXT: %i = phi i32 [ %[[EXTL]], %left ], [ %[[EXTR]], %right ] -; CHECK-NEXT: ret i32 %i +tail: %i = phi i32 [ %lhs, %left ], [ %rhs, %right ] ret i32 %i } @@ -132,9 +145,10 @@ tail: ; CHECK: tail: ; should stay the same. define i40 @type_pun_unhandled(<16 x i8> %in) { ; CHECK-LABEL: @type_pun_unhandled( -; CHECK-NEXT: %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8> -; CHECK-NEXT: %1 = bitcast <5 x i8> %sroa to i40 -; CHECK-NEXT: ret i40 %1 +; CHECK-NEXT: [[SROA:%.*]] = shufflevector <16 x i8> [[IN:%.*]], <16 x i8> undef, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <5 x i8> [[SROA]] to i40 +; CHECK-NEXT: ret i40 [[TMP1]] +; %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8> %1 = bitcast <5 x i8> %sroa to i40 ret i40 %1 diff --git a/test/Transforms/InstCombine/urem.ll b/test/Transforms/InstCombine/urem.ll deleted file mode 100644 index 0549d759eac4..000000000000 --- a/test/Transforms/InstCombine/urem.ll +++ /dev/null @@ -1,50 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -S | FileCheck %s - -define i64 @rem_unsigned(i64 %x1, i64 %y2) { -; CHECK-LABEL: @rem_unsigned( -; CHECK-NEXT: [[R:%.*]] = urem i64 %x1, %y2 -; CHECK-NEXT: ret i64 [[R]] -; - %r = udiv i64 %x1, %y2 - %r7 = mul i64 %r, %y2 - %r8 = sub i64 %x1, %r7 - ret i64 %r8 -} - -; PR28672 - https://llvm.org/bugs/show_bug.cgi?id=28672 - -define i8 @big_divisor(i8 %x) { -; CHECK-LABEL: @big_divisor( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i8 %x, -127 -; CHECK-NEXT: [[TMP2:%.*]] = add i8 %x, 127 -; CHECK-NEXT: [[REM:%.*]] = select i1 [[TMP1]], i8 %x, i8 [[TMP2]] -; CHECK-NEXT: ret i8 [[REM]] -; - %rem = urem i8 %x, 129 - ret i8 %rem -} - -define i5 @biggest_divisor(i5 %x) { -; CHECK-LABEL: @biggest_divisor( -; CHECK-NEXT: [[NOT_:%.*]] = icmp eq i5 %x, -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[NOT_]] to i5 -; CHECK-NEXT: [[REM:%.*]] = add i5 [[TMP1]], %x -; CHECK-NEXT: ret i5 [[REM]] -; - %rem = urem i5 %x, -1 - ret i5 %rem -} - -; TODO: Should vector subtract of constant be canonicalized to add? 
-define <2 x i4> @big_divisor_vec(<2 x i4> %x) { -; CHECK-LABEL: @big_divisor_vec( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i4> %x, <i4 -3, i4 -3> -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i4> %x, <i4 -3, i4 -3> -; CHECK-NEXT: [[REM:%.*]] = select <2 x i1> [[TMP1]], <2 x i4> %x, <2 x i4> [[TMP2]] -; CHECK-NEXT: ret <2 x i4> [[REM]] -; - %rem = urem <2 x i4> %x, <i4 13, i4 13> - ret <2 x i4> %rem -} - diff --git a/test/Transforms/InstCombine/vararg.ll b/test/Transforms/InstCombine/vararg.ll index 263a7425a075..111cb4de7bc3 100644 --- a/test/Transforms/InstCombine/vararg.ll +++ b/test/Transforms/InstCombine/vararg.ll @@ -2,8 +2,8 @@ %struct.__va_list = type { i8*, i8*, i8*, i32, i32 } -declare void @llvm.lifetime.start(i64, i8* nocapture) -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) declare void @llvm.va_start(i8*) declare void @llvm.va_end(i8*) declare void @llvm.va_copy(i8*, i8*) @@ -17,14 +17,14 @@ entry: %va1 = alloca %struct.__va_list, align 8 %0 = bitcast %struct.__va_list* %va0 to i8* %1 = bitcast %struct.__va_list* %va1 to i8* - call void @llvm.lifetime.start(i64 32, i8* %0) + call void @llvm.lifetime.start.p0i8(i64 32, i8* %0) call void @llvm.va_start(i8* %0) - call void @llvm.lifetime.start(i64 32, i8* %1) + call void @llvm.lifetime.start.p0i8(i64 32, i8* %1) call void @llvm.va_copy(i8* %1, i8* %0) call void @llvm.va_end(i8* %1) - call void @llvm.lifetime.end(i64 32, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 32, i8* %1) call void @llvm.va_end(i8* %0) - call void @llvm.lifetime.end(i64 32, i8* %0) + call void @llvm.lifetime.end.p0i8(i64 32, i8* %0) ret i32 0 } diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll index 7c46adaf616e..5f27634da19c 100644 --- a/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -67,7 +67,7 @@ define i64 @test3(float %f, double %d) { ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP11]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]] ; CHECK-NEXT: ret i64 [[TMP15]] ; %v00 = insertelement <4 x float> undef, float %f, i32 0 @@ -182,10 +182,9 @@ define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind { define <2 x float> @test_fptrunc(double %f) { ; CHECK-LABEL: @test_fptrunc( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double %f, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float> -; CHECK-NEXT: ret <2 x float> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> <double undef, double 0.000000e+00>, double %f, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float> +; CHECK-NEXT: ret <2 x float> [[TMP2]] ; %tmp9 = insertelement <4 x double> undef, double %f, i32 0 %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1 @@ -198,10 +197,9 @@ define <2 x float> @test_fptrunc(double %f) { define <2 x double> @test_fpext(float %f) { ; CHECK-LABEL: @test_fpext( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float %f, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float 
0.000000e+00, i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double> -; CHECK-NEXT: ret <2 x double> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float undef, float 0.000000e+00>, float %f, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> +; CHECK-NEXT: ret <2 x double> [[TMP2]] ; %tmp9 = insertelement <4 x float> undef, float %f, i32 0 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1 @@ -223,8 +221,7 @@ define <4 x double> @test_shuffle(<4 x double> %f) { define <4 x float> @test_select(float %f, float %g) { ; CHECK-LABEL: @test_select( -; CHECK-NEXT: [[A0:%.*]] = insertelement <4 x float> undef, float %f, i32 0 -; CHECK-NEXT: [[A3:%.*]] = insertelement <4 x float> [[A0]], float 3.000000e+00, i32 3 +; CHECK-NEXT: [[A3:%.*]] = insertelement <4 x float> <float undef, float undef, float undef, float 3.000000e+00>, float %f, i32 0 ; CHECK-NEXT: [[RET:%.*]] = shufflevector <4 x float> [[A3]], <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3> ; CHECK-NEXT: ret <4 x float> [[RET]] ; diff --git a/test/Transforms/InstCombine/vec_sext.ll b/test/Transforms/InstCombine/vec_sext.ll index 10947c1781e0..79a32d64b063 100644 --- a/test/Transforms/InstCombine/vec_sext.ll +++ b/test/Transforms/InstCombine/vec_sext.ll @@ -6,7 +6,7 @@ define <4 x i32> @psignd_3(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, %a ; CHECK-NEXT: [[B_LOBIT:%.*]] = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31> ; CHECK-NEXT: [[T1:%.*]] = xor <4 x i32> [[B_LOBIT]], <i32 -1, i32 -1, i32 -1, i32 -1> -; CHECK-NEXT: [[T2:%.*]] = and <4 x i32> %a, [[T1]] +; CHECK-NEXT: [[T2:%.*]] = and <4 x i32> [[T1]], %a ; CHECK-NEXT: [[T3:%.*]] = and <4 x i32> [[B_LOBIT]], [[SUB]] ; CHECK-NEXT: [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]] ; CHECK-NEXT: ret <4 x i32> [[COND]] diff --git a/test/Transforms/InstCombine/vector-casts.ll b/test/Transforms/InstCombine/vector-casts.ll index 8d01cad4b453..643ab6c5348f 100644 --- a/test/Transforms/InstCombine/vector-casts.ll +++ b/test/Transforms/InstCombine/vector-casts.ll @@ -110,23 +110,11 @@ define <2 x i64> @bar(<2 x i65> %t) { ret <2 x i64> %b } -define <2 x i65> @foos(<2 x i64> %t) { -; CHECK-LABEL: @foos( -; CHECK-NEXT: [[A:%.*]] = zext <2 x i64> %t to <2 x i65> -; CHECK-NEXT: [[SEXT:%.*]] = shl <2 x i65> [[A]], <i65 33, i65 33> -; CHECK-NEXT: [[B:%.*]] = ashr <2 x i65> [[SEXT]], <i65 33, i65 33> -; CHECK-NEXT: ret <2 x i65> [[B]] -; - %a = trunc <2 x i64> %t to <2 x i32> - %b = sext <2 x i32> %a to <2 x i65> - ret <2 x i65> %b -} - define <2 x i64> @bars(<2 x i65> %t) { ; CHECK-LABEL: @bars( ; CHECK-NEXT: [[A:%.*]] = trunc <2 x i65> %t to <2 x i64> ; CHECK-NEXT: [[SEXT:%.*]] = shl <2 x i64> [[A]], <i64 32, i64 32> -; CHECK-NEXT: [[B:%.*]] = ashr <2 x i64> [[SEXT]], <i64 32, i64 32> +; CHECK-NEXT: [[B:%.*]] = ashr exact <2 x i64> [[SEXT]], <i64 32, i64 32> ; CHECK-NEXT: ret <2 x i64> [[B]] ; %a = trunc <2 x i65> %t to <2 x i32> @@ -137,7 +125,7 @@ define <2 x i64> @bars(<2 x i65> %t) { define <2 x i64> @quxs(<2 x i64> %t) { ; CHECK-LABEL: @quxs( ; CHECK-NEXT: [[SEXT:%.*]] = shl <2 x i64> %t, <i64 32, i64 32> -; CHECK-NEXT: [[B:%.*]] = ashr <2 x i64> [[SEXT]], <i64 32, i64 32> +; CHECK-NEXT: [[B:%.*]] = ashr exact <2 x i64> [[SEXT]], <i64 32, i64 32> ; CHECK-NEXT: ret <2 x i64> [[B]] ; %a = trunc <2 x i64> %t to <2 x i32> @@ -148,7 +136,7 @@ define <2 x i64> @quxs(<2 x i64> %t) { define <2 x i64> 
@quxt(<2 x i64> %t) { ; CHECK-LABEL: @quxt( ; CHECK-NEXT: [[A:%.*]] = shl <2 x i64> %t, <i64 32, i64 32> -; CHECK-NEXT: [[B:%.*]] = ashr <2 x i64> [[A]], <i64 32, i64 32> +; CHECK-NEXT: [[B:%.*]] = ashr exact <2 x i64> [[A]], <i64 32, i64 32> ; CHECK-NEXT: ret <2 x i64> [[B]] ; %a = shl <2 x i64> %t, <i64 32, i64 32> @@ -228,3 +216,91 @@ define <8 x i32> @pr24458(<8 x float> %n) { ret <8 x i32> %wrong } +; Hoist a trunc to a scalar if we're inserting into an undef vector. +; trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index + +define <3 x i16> @trunc_inselt_undef(i32 %x) { +; CHECK-LABEL: @trunc_inselt_undef( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 %x to i16 +; CHECK-NEXT: [[TRUNC:%.*]] = insertelement <3 x i16> undef, i16 [[TMP1]], i32 1 +; CHECK-NEXT: ret <3 x i16> [[TRUNC]] +; + %vec = insertelement <3 x i32> undef, i32 %x, i32 1 + %trunc = trunc <3 x i32> %vec to <3 x i16> + ret <3 x i16> %trunc +} + +; Hoist a trunc to a scalar if we're inserting into an undef vector. +; trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index + +define <2 x float> @fptrunc_inselt_undef(double %x, i32 %index) { +; CHECK-LABEL: @fptrunc_inselt_undef( +; CHECK-NEXT: [[TMP1:%.*]] = fptrunc double %x to float +; CHECK-NEXT: [[TRUNC:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 %index +; CHECK-NEXT: ret <2 x float> [[TRUNC]] +; + %vec = insertelement <2 x double> <double undef, double undef>, double %x, i32 %index + %trunc = fptrunc <2 x double> %vec to <2 x float> + ret <2 x float> %trunc +} + +; TODO: Strengthen the backend, so we can have this canonicalization. +; Insert a scalar int into a constant vector and truncate: +; trunc (inselt C, X, Index) --> inselt C, (trunc X), Index + +define <3 x i16> @trunc_inselt1(i32 %x) { +; CHECK-LABEL: @trunc_inselt1( +; CHECK-NEXT: [[VEC:%.*]] = insertelement <3 x i32> <i32 3, i32 undef, i32 65536>, i32 %x, i32 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc <3 x i32> [[VEC]] to <3 x i16> +; CHECK-NEXT: ret <3 x i16> [[TRUNC]] +; + %vec = insertelement <3 x i32> <i32 3, i32 -2, i32 65536>, i32 %x, i32 1 + %trunc = trunc <3 x i32> %vec to <3 x i16> + ret <3 x i16> %trunc +} + +; TODO: Strengthen the backend, so we can have this canonicalization. +; Insert a scalar FP into a constant vector and FP truncate: +; fptrunc (inselt C, X, Index) --> inselt C, (fptrunc X), Index + +define <2 x float> @fptrunc_inselt1(double %x, i32 %index) { +; CHECK-LABEL: @fptrunc_inselt1( +; CHECK-NEXT: [[VEC:%.*]] = insertelement <2 x double> <double undef, double 3.000000e+00>, double %x, i32 %index +; CHECK-NEXT: [[TRUNC:%.*]] = fptrunc <2 x double> [[VEC]] to <2 x float> +; CHECK-NEXT: ret <2 x float> [[TRUNC]] +; + %vec = insertelement <2 x double> <double undef, double 3.0>, double %x, i32 %index + %trunc = fptrunc <2 x double> %vec to <2 x float> + ret <2 x float> %trunc +} + +; TODO: Strengthen the backend, so we can have this canonicalization. +; Insert a scalar int constant into a vector and truncate: +; trunc (inselt X, C, Index) --> inselt (trunc X), C', Index + +define <8 x i16> @trunc_inselt2(<8 x i32> %x, i32 %index) { +; CHECK-LABEL: @trunc_inselt2( +; CHECK-NEXT: [[VEC:%.*]] = insertelement <8 x i32> %x, i32 1048576, i32 %index +; CHECK-NEXT: [[TRUNC:%.*]] = trunc <8 x i32> [[VEC]] to <8 x i16> +; CHECK-NEXT: ret <8 x i16> [[TRUNC]] +; + %vec = insertelement <8 x i32> %x, i32 1048576, i32 %index + %trunc = trunc <8 x i32> %vec to <8 x i16> + ret <8 x i16> %trunc +} + +; TODO: Strengthen the backend, so we can have this canonicalization. 
+; Insert a scalar FP constant into a vector and FP truncate: +; fptrunc (inselt X, C, Index) --> inselt (fptrunc X), C', Index + +define <3 x float> @fptrunc_inselt2(<3 x double> %x) { +; CHECK-LABEL: @fptrunc_inselt2( +; CHECK-NEXT: [[VEC:%.*]] = insertelement <3 x double> %x, double 4.000000e+00, i32 2 +; CHECK-NEXT: [[TRUNC:%.*]] = fptrunc <3 x double> [[VEC]] to <3 x float> +; CHECK-NEXT: ret <3 x float> [[TRUNC]] +; + %vec = insertelement <3 x double> %x, double 4.0, i32 2 + %trunc = fptrunc <3 x double> %vec to <3 x float> + ret <3 x float> %trunc +} + diff --git a/test/Transforms/InstCombine/vector-srem.ll b/test/Transforms/InstCombine/vector-srem.ll deleted file mode 100644 index 44b38596e684..000000000000 --- a/test/Transforms/InstCombine/vector-srem.ll +++ /dev/null @@ -1,13 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -S | FileCheck %s - -define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u) { -; CHECK-LABEL: @foo( -; CHECK-NEXT: [[K:%.*]] = srem <4 x i32> %t, %u -; CHECK-NEXT: ret <4 x i32> [[K]] -; - %k = sdiv <4 x i32> %t, %u - %l = mul <4 x i32> %k, %u - %m = sub <4 x i32> %t, %l - ret <4 x i32> %m -} diff --git a/test/Transforms/InstCombine/vector-urem.ll b/test/Transforms/InstCombine/vector-urem.ll index 6cecc16069d3..34eebeef3bb1 100644 --- a/test/Transforms/InstCombine/vector-urem.ll +++ b/test/Transforms/InstCombine/vector-urem.ll @@ -19,11 +19,3 @@ define <4 x i32> @test_v4i32_const_pow2(<4 x i32> %a0) { ret <4 x i32> %1 } -define <4 x i32> @test_v4i32_const_pow2_or_zero(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_const_pow2_or_zero( -; CHECK-NEXT: [[TMP1:%.*]] = urem <4 x i32> %a0, <i32 1, i32 2, i32 0, i32 8> -; CHECK-NEXT: ret <4 x i32> [[TMP1]] -; - %1 = urem <4 x i32> %a0, <i32 1, i32 2, i32 0, i32 8> - ret <4 x i32> %1 -} diff --git a/test/Transforms/InstCombine/vector_insertelt_shuffle.ll b/test/Transforms/InstCombine/vector_insertelt_shuffle.ll index b3e614653cfa..c358509d690e 100644 --- a/test/Transforms/InstCombine/vector_insertelt_shuffle.ll +++ b/test/Transforms/InstCombine/vector_insertelt_shuffle.ll @@ -1,94 +1,95 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -define<4 x float> @foo(<4 x float> %x) { +; insertelements should fold to shuffle +define <4 x float> @foo(<4 x float> %x) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[INS2:%.*]] = shufflevector <4 x float> %x, <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3> +; CHECK-NEXT: ret <4 x float> [[INS2]] +; %ins1 = insertelement<4 x float> %x, float 1.0, i32 1 %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2 - ret<4 x float> %ins2 + ret <4 x float> %ins2 } -; insertelements should fold to shuffle -; CHECK-LABEL: @foo -; CHECK-NEXT: shufflevector <4 x float> %{{.+}}, <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3> -; CHECK-NEXT: ret <4 x float> % +; Insert of a constant is canonicalized ahead of insert of a variable. 
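; A rough before/after sketch of that canonicalization (illustrative only,
; using the same operands as the test below):
;
;   %ins1 = insertelement <4 x float> %x, float %a, i32 1
;   %ins2 = insertelement <4 x float> %ins1, float 2.0, i32 2
; -->
;   %tmp = insertelement <4 x float> %x, float 2.0, i32 2
;   %ins2 = insertelement <4 x float> %tmp, float %a, i32 1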
-define<4 x float> @bar(<4 x float> %x, float %a) { +define <4 x float> @bar(<4 x float> %x, float %a) { +; CHECK-LABEL: @bar( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> %x, float 2.000000e+00, i32 2 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 1 +; CHECK-NEXT: ret <4 x float> [[INS2]] +; %ins1 = insertelement<4 x float> %x, float %a, i32 1 %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2 - ret<4 x float> %ins2 + ret <4 x float> %ins2 } -; CHECK-LABEL: @bar -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 1 -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 2.000000e+00, i32 2 -; CHECK-NEXT: ret <4 x float> % - -define<4 x float> @baz(<4 x float> %x, i32 %a) { +define <4 x float> @baz(<4 x float> %x, i32 %a) { +; CHECK-LABEL: @baz( +; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> %x, float 1.000000e+00, i32 1 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float 2.000000e+00, i32 %a +; CHECK-NEXT: ret <4 x float> [[INS2]] +; %ins1 = insertelement<4 x float> %x, float 1.0, i32 1 %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 %a - ret<4 x float> %ins2 + ret <4 x float> %ins2 } -; CHECK-LABEL: @baz -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 1.000000e+00, i32 1 -; CHECK-NEXT: insertelement <4 x float> %ins1, float 2.000000e+00, i32 % -; CHECK-NEXT: ret <4 x float> % - -define<4 x float> @bazz(<4 x float> %x, i32 %a) { +; insertelements should fold to shuffle +define <4 x float> @bazz(<4 x float> %x, i32 %a) { +; CHECK-LABEL: @bazz( +; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> %x, float 1.000000e+00, i32 3 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float 5.000000e+00, i32 %a +; CHECK-NEXT: [[INS5:%.*]] = shufflevector <4 x float> [[INS2]], <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3> +; CHECK-NEXT: [[INS6:%.*]] = insertelement <4 x float> [[INS5]], float 7.000000e+00, i32 %a +; CHECK-NEXT: ret <4 x float> [[INS6]] +; %ins1 = insertelement<4 x float> %x, float 1.0, i32 3 %ins2 = insertelement<4 x float> %ins1, float 5.0, i32 %a %ins3 = insertelement<4 x float> %ins2, float 3.0, i32 2 %ins4 = insertelement<4 x float> %ins3, float 1.0, i32 1 %ins5 = insertelement<4 x float> %ins4, float 2.0, i32 2 %ins6 = insertelement<4 x float> %ins5, float 7.0, i32 %a - ret<4 x float> %ins6 + ret <4 x float> %ins6 } -; insertelements should fold to shuffle -; CHECK-LABEL: @bazz -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 1.000000e+00, i32 3 -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 5.000000e+00, i32 % -; CHECK-NEXT: shufflevector <4 x float> %{{.+}}, <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3> -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 7.000000e+00, i32 % -; CHECK-NEXT: ret <4 x float> % - -define<4 x float> @bazzz(<4 x float> %x) { +define <4 x float> @bazzz(<4 x float> %x) { +; CHECK-LABEL: @bazzz( +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> %x, float 2.000000e+00, i32 2 +; CHECK-NEXT: ret <4 x float> [[INS2]] +; %ins1 = insertelement<4 x float> %x, float 1.0, i32 5 %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2 - ret<4 x float> %ins2 + ret <4 x float> %ins2 } -; CHECK-LABEL: @bazzz -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 2.000000e+00, i32 2 -; CHECK-NEXT: ret <4 x float> % - -define<4 x float> @bazzzz(<4 x float> %x) { +define 
<4 x float> @bazzzz(<4 x float> %x) { +; CHECK-LABEL: @bazzzz( +; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> %x, float 1.000000e+00, i32 undef +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> %x, float 2.000000e+00, i32 2 +; CHECK-NEXT: ret <4 x float> [[INS2]] +; %ins1 = insertelement<4 x float> %x, float 1.0, i32 undef %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2 - ret<4 x float> %ins2 + ret <4 x float> %ins2 } -; CHECK-LABEL: @bazzzz -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 1.000000e+00, i32 undef -; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 2.000000e+00, i32 2 -; CHECK-NEXT: ret <4 x float> % - -define<4 x float> @bazzzzz() { +define <4 x float> @bazzzzz() { +; CHECK-LABEL: @bazzzzz( +; CHECK-NEXT: ret <4 x float> <float 1.000000e+00, float 5.000000e+00, float 1.000000e+01, float 4.000000e+00> +; %ins1 = insertelement <4 x float> insertelement (<4 x float> <float 1.0, float 2.0, float 3.0, float undef>, float 4.0, i32 3), float 5.0, i32 1 %ins2 = insertelement<4 x float> %ins1, float 10.0, i32 2 - ret<4 x float> %ins2 + ret <4 x float> %ins2 } -; insertelements should fold to shuffle -; CHECK-LABEL: @bazzzzz -; CHECK-NEXT: ret <4 x float> <float 1.000000e+00, float 5.000000e+00, float 1.000000e+01, float 4.000000e+00> - -define<4 x float> @bazzzzzz(<4 x float> %x, i32 %a) { +define <4 x float> @bazzzzzz(<4 x float> %x, i32 %a) { +; CHECK-LABEL: @bazzzzzz( +; CHECK-NEXT: ret <4 x float> <float undef, float 5.000000e+00, float undef, float 4.000000e+00> +; %ins1 = insertelement <4 x float> insertelement (<4 x float> shufflevector (<4 x float> undef, <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0> , <4 x i32> <i32 0, i32 5, i32 undef, i32 6> ), float 4.0, i32 3), float 5.0, i32 1 - ret<4 x float> %ins1 + ret <4 x float> %ins1 } -; insertelements should fold to shuffle -; CHECK-LABEL: @bazzzzz -; CHECK-NEXT: ret <4 x float> <float undef, float 5.000000e+00, float undef, float 4.000000e+00> diff --git a/test/Transforms/InstCombine/win-math.ll b/test/Transforms/InstCombine/win-math.ll index e6e79e2b84a0..36947791393d 100644 --- a/test/Transforms/InstCombine/win-math.ll +++ b/test/Transforms/InstCombine/win-math.ll @@ -56,15 +56,15 @@ declare double @ceil(double %x) define float @float_ceil(float %x) nounwind readnone { ; WIN32-LABEL: @float_ceil( ; WIN32-NOT: float @ceilf -; WIN32: double @ceil +; WIN32: float @llvm.ceil.f32 ; WIN64-LABEL: @float_ceil( -; WIN64: float @ceilf +; WIN64: float @llvm.ceil.f32 ; WIN64-NOT: double @ceil ; MINGW32-LABEL: @float_ceil( -; MINGW32: float @ceilf +; MINGW32: float @llvm.ceil.f32 ; MINGW32-NOT: double @ceil ; MINGW64-LABEL: @float_ceil( -; MINGW64: float @ceilf +; MINGW64: float @llvm.ceil.f32 ; MINGW64-NOT: double @ceil %1 = fpext float %x to double %2 = call double @ceil(double %1) @@ -137,15 +137,15 @@ declare double @floor(double %x) define float @float_floor(float %x) nounwind readnone { ; WIN32-LABEL: @float_floor( ; WIN32-NOT: float @floorf -; WIN32: double @floor +; WIN32: float @llvm.floor.f32 ; WIN64-LABEL: @float_floor( -; WIN64: float @floorf +; WIN64: float @llvm.floor.f32 ; WIN64-NOT: double @floor ; MINGW32-LABEL: @float_floor( -; MINGW32: float @floorf +; MINGW32: float @llvm.floor.f32 ; MINGW32-NOT: double @floor ; MINGW64-LABEL: @float_floor( -; MINGW64: float @floorf +; MINGW64: float @llvm.floor.f32 ; MINGW64-NOT: double @floor %1 = fpext float %x to double %2 = call double @floor(double %1) @@ -262,10 +262,10 @@ define float @float_round(float %x) nounwind readnone { ; 
diff --git a/test/Transforms/InstCombine/win-math.ll b/test/Transforms/InstCombine/win-math.ll
index e6e79e2b84a0..36947791393d 100644
--- a/test/Transforms/InstCombine/win-math.ll
+++ b/test/Transforms/InstCombine/win-math.ll
@@ -56,15 +56,15 @@ declare double @ceil(double %x)
 define float @float_ceil(float %x) nounwind readnone {
 ; WIN32-LABEL: @float_ceil(
 ; WIN32-NOT: float @ceilf
-; WIN32: double @ceil
+; WIN32: float @llvm.ceil.f32
 ; WIN64-LABEL: @float_ceil(
-; WIN64: float @ceilf
+; WIN64: float @llvm.ceil.f32
 ; WIN64-NOT: double @ceil
 ; MINGW32-LABEL: @float_ceil(
-; MINGW32: float @ceilf
+; MINGW32: float @llvm.ceil.f32
 ; MINGW32-NOT: double @ceil
 ; MINGW64-LABEL: @float_ceil(
-; MINGW64: float @ceilf
+; MINGW64: float @llvm.ceil.f32
 ; MINGW64-NOT: double @ceil
 %1 = fpext float %x to double
 %2 = call double @ceil(double %1)
@@ -137,15 +137,15 @@ declare double @floor(double %x)
 define float @float_floor(float %x) nounwind readnone {
 ; WIN32-LABEL: @float_floor(
 ; WIN32-NOT: float @floorf
-; WIN32: double @floor
+; WIN32: float @llvm.floor.f32
 ; WIN64-LABEL: @float_floor(
-; WIN64: float @floorf
+; WIN64: float @llvm.floor.f32
 ; WIN64-NOT: double @floor
 ; MINGW32-LABEL: @float_floor(
-; MINGW32: float @floorf
+; MINGW32: float @llvm.floor.f32
 ; MINGW32-NOT: double @floor
 ; MINGW64-LABEL: @float_floor(
-; MINGW64: float @floorf
+; MINGW64: float @llvm.floor.f32
 ; MINGW64-NOT: double @floor
 %1 = fpext float %x to double
 %2 = call double @floor(double %1)
@@ -262,10 +262,10 @@ define float @float_round(float %x) nounwind readnone {
 ; WIN64-NOT: float @roundf
 ; WIN64: double @round
 ; MINGW32-LABEL: @float_round(
-; MINGW32: float @roundf
+; MINGW32: float @llvm.round.f32
 ; MINGW32-NOT: double @round
 ; MINGW64-LABEL: @float_round(
-; MINGW64: float @roundf
+; MINGW64: float @llvm.round.f32
 ; MINGW64-NOT: double @round
 %1 = fpext float %x to double
 %2 = call double @round(double %1)
@@ -274,21 +274,26 @@ define float @float_round(float %x) nounwind readnone {
 }
 declare float @powf(float, float)
-; win32 lacks sqrtf&fabsf, win64 lacks fabsf
+
+; win32 lacks sqrtf&fabsf, win64 lacks fabsf, but
+; calls to the intrinsics can be emitted instead.
 define float @float_powsqrt(float %x) nounwind readnone {
 ; WIN32-LABEL: @float_powsqrt(
 ; WIN32-NOT: float @sqrtf
 ; WIN32: float @powf
+
 ; WIN64-LABEL: @float_powsqrt(
-; WIN64-NOT: float @sqrtf
-; WIN64: float @powf
+; WIN64: float @sqrtf
+; WIN64: float @llvm.fabs.f32(
+; WIN64-NOT: float @powf
+
 ; MINGW32-LABEL: @float_powsqrt(
 ; MINGW32: float @sqrtf
-; MINGW32: float @fabsf
+; MINGW32: float @llvm.fabs.f32
 ; MINGW32-NOT: float @powf
 ; MINGW64-LABEL: @float_powsqrt(
 ; MINGW64: float @sqrtf
-; MINGW64: float @fabsf
+; MINGW64: float @llvm.fabs.f32(
 ; MINGW64-NOT: float @powf
 %1 = call float @powf(float %x, float 0.5)
 ret float %1
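[Editor's note on the win-math.ll changes: when the float version of a math routine is not available as a libcall on the target, InstCombine's libcall shrinking can still narrow the fpext/call/fptrunc sandwich to the target-independent float intrinsic, which the backend legalizes later. A hedged sketch of the input shape being tested; the triple and function name here are illustrative assumptions, not taken from the patch:

; RUN: opt < %s -instcombine -S -mtriple=x86_64-pc-mingw32 | FileCheck %s
declare double @floor(double)

; Only the truncated result is used, so the double libcall can shrink
; to the float intrinsic.
define float @shrink_floor(float %x) {
; CHECK-LABEL: @shrink_floor(
; CHECK: call float @llvm.floor.f32(float %x)
  %d = fpext float %x to double
  %r = call double @floor(double %d)
  %f = fptrunc double %r to float
  ret float %f
}
]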
diff --git a/test/Transforms/InstCombine/x86-avx2.ll b/test/Transforms/InstCombine/x86-avx2.ll
index 4c13b4c6ae74..f4045f788e2d 100644
--- a/test/Transforms/InstCombine/x86-avx2.ll
+++ b/test/Transforms/InstCombine/x86-avx2.ll
@@ -81,5 +81,29 @@ define <8 x float> @undef_test_vpermps(<8 x float> %a0) {
 ret <8 x float> %a
 }
+; Verify simplify demanded elts.
+
+define <8 x i32> @elts_test_vpermd(<8 x i32> %a0, i32 %a1) {
+; CHECK-LABEL: @elts_test_vpermd(
+; CHECK-NEXT: ret <8 x i32> %a0
+;
+ %1 = insertelement <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i32 %a1, i32 0
+ %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %1)
+ %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %3
+}
+
+define <8 x float> @elts_test_vpermps(<8 x float> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @elts_test_vpermps(
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: ret <8 x float> [[TMP2]]
+;
+ %1 = insertelement <8 x i32> %a1, i32 0, i32 7
+ %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %1)
+ %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %3
+}
+
 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
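[Editor's note on the demanded-elements tests above: lanes of the permute that its user never reads do not constrain the selector, so perturbing such a lane of an identity selector still lets the whole call fold away. The same reasoning appears in a much smaller form, sketched below under a plain -instcombine run; the names are illustrative:

; RUN: opt < %s -instcombine -S | FileCheck %s
; Only lane 3 is demanded by the extract, so the insert into lane 0 is
; dead and the extract reads straight through to %v.
define i32 @demanded_lane(<4 x i32> %v, i32 %s) {
; CHECK-LABEL: @demanded_lane(
; CHECK-NEXT: extractelement <4 x i32> %v, i32 3
  %ins = insertelement <4 x i32> %v, i32 %s, i32 0
  %e = extractelement <4 x i32> %ins, i32 3
  ret i32 %e
}
]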
diff --git a/test/Transforms/InstCombine/x86-avx512.ll b/test/Transforms/InstCombine/x86-avx512.ll
index d2a2580d8c24..2a24d93ce76a 100644
--- a/test/Transforms/InstCombine/x86-avx512.ll
+++ b/test/Transforms/InstCombine/x86-avx512.ll
@@ -6,10 +6,10 @@ declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>,
 define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_add_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> %b, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> %a, float [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
 ; CHECK-NEXT: ret <4 x float> [[TMP4]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -21,7 +21,7 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_add_ss_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -33,14 +33,14 @@ define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_add_ss_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 %mask to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> %c, i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> %a, float [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
 ; CHECK-NEXT: ret <4 x float> [[TMP8]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -52,7 +52,7 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_add_ss_mask_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -83,10 +83,10 @@ declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x doubl
 define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_add_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> %a, double [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
 ; CHECK-NEXT: ret <2 x double> [[TMP4]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -96,7 +96,7 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_add_sd_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -106,14 +106,14 @@ define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_add_sd_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 %mask to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> %c, i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> %a, double [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
 ; CHECK-NEXT: ret <2 x double> [[TMP8]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
@@ -123,7 +123,7 @@ define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_add_sd_mask_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
@@ -148,10 +148,10 @@ declare <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float>, <4 x float>,
 define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_sub_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> %b, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> %a, float [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
 ; CHECK-NEXT: ret <4 x float> [[TMP4]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -163,7 +163,7 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_sub_ss_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -175,14 +175,14 @@ define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_sub_ss_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 %mask to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> %c, i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> %a, float [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
 ; CHECK-NEXT: ret <4 x float> [[TMP8]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -194,7 +194,7 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_sub_ss_mask_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -225,10 +225,10 @@ declare <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double>, <2 x doubl
 define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_sub_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> %a, double [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
 ; CHECK-NEXT: ret <2 x double> [[TMP4]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -238,7 +238,7 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_sub_sd_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -248,14 +248,14 @@ define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_sub_sd_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 %mask to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> %c, i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> %a, double [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
 ; CHECK-NEXT: ret <2 x double> [[TMP8]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
@@ -265,7 +265,7 @@ define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_sub_sd_mask_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
@@ -290,10 +290,10 @@ declare <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float>, <4 x float>,
 define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_mul_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> %b, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> %a, float [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
 ; CHECK-NEXT: ret <4 x float> [[TMP4]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -305,7 +305,7 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_mul_ss_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -317,14 +317,14 @@ define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mul_ss_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 %mask to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> %c, i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> %a, float [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
 ; CHECK-NEXT: ret <4 x float> [[TMP8]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -336,7 +336,7 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mul_ss_mask_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -367,10 +367,10 @@ declare <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double>, <2 x doubl
 define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_mul_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> %a, double [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
 ; CHECK-NEXT: ret <2 x double> [[TMP4]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -380,7 +380,7 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_mul_sd_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -390,14 +390,14 @@ define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mul_sd_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 %mask to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> %c, i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> %a, double [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
 ; CHECK-NEXT: ret <2 x double> [[TMP8]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
@@ -407,7 +407,7 @@ define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mul_sd_mask_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
@@ -432,10 +432,10 @@ declare <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float>, <4 x float>,
 define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_div_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> %b, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> %a, float [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
 ; CHECK-NEXT: ret <4 x float> [[TMP4]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -447,7 +447,7 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_div_ss_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -459,14 +459,14 @@ define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_div_ss_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 %mask to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> %c, i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> %a, float [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
 ; CHECK-NEXT: ret <4 x float> [[TMP8]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -478,7 +478,7 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_div_ss_mask_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -509,10 +509,10 @@ declare <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double>, <2 x doubl
 define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_div_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> %a, double [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
 ; CHECK-NEXT: ret <2 x double> [[TMP4]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -522,7 +522,7 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_div_sd_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -532,14 +532,14 @@ define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_div_sd_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> %a, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> %b, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 %mask to <8 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> %c, i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
 ; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> %a, double [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
 ; CHECK-NEXT: ret <2 x double> [[TMP8]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
@@ -549,7 +549,7 @@ define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_div_sd_mask_round(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
@@ -574,7 +574,7 @@ declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>,
 define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_max_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -586,7 +586,7 @@ define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_max_ss_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -617,7 +617,7 @@ declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x doubl
 define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_max_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -627,7 +627,7 @@ define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_max_sd_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
@@ -652,7 +652,7 @@ declare <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float>, <4 x float>,
 define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_min_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
@@ -664,7 +664,7 @@ define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_min_ss_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
 ;
 %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
@@ -695,7 +695,7 @@ declare <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double>, <2 x doubl
 define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_min_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
@@ -705,7 +705,7 @@ define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_min_sd_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: ret <2 x double> [[TMP1]]
 ;
 %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
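[Editor's note on the _ss/_sd blocks above: the scalar AVX-512 intrinsics read only lane 0 of their vector inputs (plus lane 0 of the passthru under the low mask bit), so the inserts that scribble on the upper lanes are provably dead; and when the rounding argument is 4 (current direction) with an all-ones mask, the call itself decays to extract/op/insert scalar IR, which is what the updated checks spell out. A reduced sketch of that input shape, reusing the intrinsic declared in the test; the function name is illustrative:

; RUN: opt < %s -instcombine -S | FileCheck %s
declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)

; Lanes 1 and 2 of %b are never read by the lane-0 intrinsic, so both
; inserts are dead; mask -1 and rounding 4 let the add become an fadd.
define <4 x float> @fold_add_ss(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @fold_add_ss(
; CHECK: fadd float
; CHECK-NOT: @llvm.x86.avx512.mask.add.ss.round
  %1 = insertelement <4 x float> %b, float 1.0, i32 1
  %2 = insertelement <4 x float> %1, float 2.0, i32 2
  %r = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %2, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %r
}
]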
@@ -730,7 +730,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
 define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) {
 ; CHECK-LABEL: @test_cmp_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %a, <4 x float> %b, i32 3, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: ret i8 [[TMP1]]
 ;
 %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
@@ -747,7 +747,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32
 define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) {
 ; CHECK-LABEL: @test_cmp_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %a, <2 x double> %b, i32 3, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: ret i8 [[TMP1]]
 ;
 %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
@@ -758,22 +758,22 @@ define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) {
 define i64 @test(float %f, double %d) {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT: [[V00:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[V00]], i32 4)
-; CHECK-NEXT: [[V10:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> [[V10]], i32 4)
-; CHECK-NEXT: [[V20:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[V20]], i32 4)
-; CHECK-NEXT: [[V30:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> [[V30]], i32 4)
-; CHECK-NEXT: [[V40:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[V40]], i32 4)
-; CHECK-NEXT: [[V50:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> [[V50]], i32 4)
-; CHECK-NEXT: [[V60:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[V60]], i32 4)
-; CHECK-NEXT: [[V70:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> [[V70]], i32 4)
+; CHECK-NEXT: [[V03:%.*]] = insertelement <4 x float> undef, float [[F:%.*]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[V03]], i32 4)
+; CHECK-NEXT: [[V13:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> [[V13]], i32 4)
+; CHECK-NEXT: [[V23:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[V23]], i32 4)
+; CHECK-NEXT: [[V33:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> [[V33]], i32 4)
+; CHECK-NEXT: [[V41:%.*]] = insertelement <2 x double> undef, double [[D:%.*]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[V41]], i32 4)
+; CHECK-NEXT: [[V51:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> [[V51]], i32 4)
+; CHECK-NEXT: [[V61:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[V61]], i32 4)
+; CHECK-NEXT: [[V71:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> [[V71]], i32 4)
 ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]]
 ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
@@ -781,7 +781,7 @@ define i64 @test(float %f, double %d) {
 ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP11]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
 ; CHECK-NEXT: ret i64 [[TMP15]]
 ;
 %v00 = insertelement <4 x float> undef, float %f, i32 0
@@ -838,22 +838,22 @@ declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32)
 define i64 @test2(float %f, double %d) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT: [[V00:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[V00]], i32 4)
-; CHECK-NEXT: [[V10:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> [[V10]], i32 4)
-; CHECK-NEXT: [[V20:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[V20]], i32 4)
-; CHECK-NEXT: [[V30:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> [[V30]], i32 4)
-; CHECK-NEXT: [[V40:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[V40]], i32 4)
-; CHECK-NEXT: [[V50:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> [[V50]], i32 4)
-; CHECK-NEXT: [[V60:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[V60]], i32 4)
-; CHECK-NEXT: [[V70:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> [[V70]], i32 4)
+; CHECK-NEXT: [[V03:%.*]] = insertelement <4 x float> undef, float [[F:%.*]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[V03]], i32 4)
+; CHECK-NEXT: [[V13:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> [[V13]], i32 4)
+; CHECK-NEXT: [[V23:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[V23]], i32 4)
+; CHECK-NEXT: [[V33:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> [[V33]], i32 4)
+; CHECK-NEXT: [[V41:%.*]] = insertelement <2 x double> undef, double [[D:%.*]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[V41]], i32 4)
+; CHECK-NEXT: [[V51:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> [[V51]], i32 4)
+; CHECK-NEXT: [[V61:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[V61]], i32 4)
+; CHECK-NEXT: [[V71:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> [[V71]], i32 4)
 ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]]
 ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
@@ -861,7 +861,7 @@ define i64 @test2(float %f, double %d) {
 ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP11]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
 ; CHECK-NEXT: ret i64 [[TMP15]]
 ;
 %v00 = insertelement <4 x float> undef, float %f, i32 0
@@ -920,8 +920,8 @@ declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4
 define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask_vfmadd_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <4 x float> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <4 x float> [[RES]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
@@ -935,7 +935,7 @@ define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x flo
 define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask_vfmadd_ss_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT: ret float [[TMP2]]
 ;
@@ -963,8 +963,8 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>,
 define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask_vfmadd_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <2 x double> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <2 x double> [[RES]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
 %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
@@ -974,7 +974,7 @@ define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x
 define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask_vfmadd_sd_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT: ret double [[TMP2]]
 ;
@@ -998,8 +998,8 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <
 define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_maskz_vfmadd_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <4 x float> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <4 x float> [[RES]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
@@ -1013,7 +1013,7 @@ define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl
 define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_maskz_vfmadd_ss_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT: ret float [[TMP2]]
 ;
@@ -1041,8 +1041,8 @@ declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>
 define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_maskz_vfmadd_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <2 x double> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <2 x double> [[RES]]
 ;
 %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
 %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
@@ -1052,7 +1052,7 @@ define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x
 define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_maskz_vfmadd_sd_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT: ret double [[TMP2]]
 ;
@@ -1076,8 +1076,8 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <
 define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfmadd_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <4 x float> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <4 x float> [[RES]]
 ;
 %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
@@ -1091,7 +1091,7 @@ define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl
 define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfmadd_ss_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT: ret float [[TMP2]]
 ;
@@ -1119,8 +1119,8 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>
 define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfmadd_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <2 x double> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <2 x double> [[RES]]
 ;
 %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
 %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
@@ -1130,7 +1130,7 @@ define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x
 define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfmadd_sd_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT: ret double [[TMP2]]
 ;
@@ -1154,8 +1154,8 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <
 define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfmsub_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <4 x float> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <4 x float> [[RES]]
 ;
 %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
@@ -1169,7 +1169,7 @@ define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x fl
 define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfmsub_ss_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT: ret float [[TMP2]]
 ;
@@ -1197,8 +1197,8 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>
 define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfmsub_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <2 x double> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <2 x double> [[RES]]
 ;
 %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
 %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
@@ -1208,7 +1208,7 @@ define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x
 define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfmsub_sd_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT: ret double [[TMP2]]
 ;
@@ -1232,8 +1232,8 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>,
 define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <4 x float> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <4 x float> [[RES]]
 ;
 %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
@@ -1247,7 +1247,7 @@ define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x f
 define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT: ret float [[TMP2]]
 ;
@@ -1275,8 +1275,8 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double
 define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
-; CHECK-NEXT: ret <2 x double> [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: ret <2 x double> [[RES]]
 ;
 %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
 %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
@@ -1286,7 +1286,7 @@ define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2
 define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT: ret double [[TMP2]]
 ;
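[Editor's note on the permvar block that follows: the variable permutes reduce to something cheaper whenever the selector is a constant, since an identity selector folds to the input, any other constant selector becomes a plain shufflevector, and the write-mask lowers to a bitcast of the i8 mask plus a vector select, which is what the [[TMP2]]/[[TMP3]] checks encode. A reduced sketch of the masked-identity shape, reusing the intrinsic declared in the test; the function name is illustrative:

; RUN: opt < %s -instcombine -S | FileCheck %s
declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

; With an identity selector the permute is a no-op, so only the masked
; blend between %a0 and %passthru should survive.
define <8 x i32> @masked_identity(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: @masked_identity(
; CHECK: bitcast i8 %mask to <8 x i1>
; CHECK: select <8 x i1>
; CHECK-NOT: @llvm.x86.avx512.mask.permvar.si.256
  %r = tail call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> %passthru, i8 %mask)
  ret <8 x i32> %r
}
]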
@zero_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a = tail call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> %passthru, i8 %mask) @@ -1348,7 +1348,7 @@ define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passth define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; %a = tail call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <8 x i32> undef, i8 -1) @@ -1357,9 +1357,9 @@ define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a = tail call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <8 x i32> %passthru, i8 %mask) @@ -1368,7 +1368,7 @@ define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pas define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { ; CHECK-LABEL: @undef_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] ; %a = tail call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <8 x i32> undef, i8 -1) @@ -1377,9 +1377,9 @@ define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { ; CHECK-LABEL: @undef_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> 
<i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a = tail call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <8 x i32> %passthru, i8 %mask) @@ -1390,7 +1390,7 @@ declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { ; CHECK-LABEL: @identity_test_permvar_sf_256( -; CHECK-NEXT: ret <8 x float> %a0 +; CHECK-NEXT: ret <8 x float> [[A0:%.*]] ; %a = tail call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x float> undef, i8 -1) ret <8 x float> %a @@ -1398,8 +1398,8 @@ define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { ; CHECK-LABEL: @identity_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> %a0, <8 x float> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x float> [[TMP2]] ; %a = tail call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x float> %passthru, i8 %mask) @@ -1408,7 +1408,7 @@ define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x floa define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { ; CHECK-LABEL: @zero_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a = tail call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> zeroinitializer, <8 x float> undef, i8 -1) @@ -1417,9 +1417,9 @@ define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { ; CHECK-LABEL: @zero_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a = tail call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> 
%a0, <8 x i32> zeroinitializer, <8 x float> %passthru, i8 %mask) @@ -1428,7 +1428,7 @@ define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> % define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a = tail call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <8 x float> undef, i8 -1) @@ -1437,9 +1437,9 @@ define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a = tail call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <8 x float> %passthru, i8 %mask) @@ -1448,7 +1448,7 @@ define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { ; CHECK-LABEL: @undef_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a = tail call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <8 x float> undef, i8 -1) @@ -1457,9 +1457,9 @@ define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { ; CHECK-LABEL: @undef_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x float> [[TMP3]] 
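; All of these permvar tests exercise the same InstCombine fold: a variable
; permute whose index vector is constant becomes a plain shufflevector
; (identity indices fold to the input itself, zeroinitializer to a lane-0
; splat), and the mask/passthru operands lower to a bitcast of the integer
; mask to <N x i1> feeding a select. For the 4-element 256-bit cases that
; follow, the i8 mask still bitcasts to <8 x i1>, so its low four bits are
; first extracted with a second shufflevector (captured as [[EXTRACT]]).
; A hedged sketch of the before/after shape for a hypothetical masked splat:
define <8 x float> @masked_splat_sketch(<8 x float> %v, <8 x float> %pass, i8 %m) {
; Before the fold (input IR):
;   %p = tail call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %v, <8 x i32> zeroinitializer, <8 x float> %pass, i8 %m)
; After the fold, the shape the CHECK lines expect:
  %splat = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> zeroinitializer
  %bits = bitcast i8 %m to <8 x i1>
  %sel = select <8 x i1> %bits, <8 x float> %splat, <8 x float> %pass
  ret <8 x float> %sel
}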
; %a = tail call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <8 x float> %passthru, i8 %mask) @@ -1470,7 +1470,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { ; CHECK-LABEL: @identity_test_permvar_di_256( -; CHECK-NEXT: ret <4 x i64> %a0 +; CHECK-NEXT: ret <4 x i64> [[A0:%.*]] ; %a = tail call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> undef, i8 -1) ret <4 x i64> %a @@ -1478,10 +1478,10 @@ define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: @identity_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> %a0, <4 x i64> %passthru -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP2]] ; %a = tail call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> %passthru, i8 %mask) ret <4 x i64> %a @@ -1489,7 +1489,7 @@ define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pa define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { ; CHECK-LABEL: @zero_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x i64> [[TMP1]] ; %a = tail call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer, <4 x i64> undef, i8 -1) @@ -1498,11 +1498,11 @@ define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: @zero_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP1]], <4 x i64> %passthru -; CHECK-NEXT: ret <4 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] ; %a = tail call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer, <4 x i64> %passthru, i8 %mask) ret <4 x i64> %a @@ -1510,7 +1510,7 @@ define <4 x i64> 
@zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passth define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <4 x i64> [[TMP1]] ; %a = tail call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 3, i64 2, i64 1, i64 0>, <4 x i64> undef, i8 -1) @@ -1519,11 +1519,11 @@ define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP1]], <4 x i64> %passthru -; CHECK-NEXT: ret <4 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] ; %a = tail call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 3, i64 2, i64 1, i64 0>, <4 x i64> %passthru, i8 %mask) ret <4 x i64> %a @@ -1531,7 +1531,7 @@ define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pas define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { ; CHECK-LABEL: @undef_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <4 x i64> [[TMP1]] ; %a = tail call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 undef, i64 2, i64 1, i64 0>, <4 x i64> undef, i8 -1) @@ -1540,11 +1540,11 @@ define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: @undef_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP1]], <4 x i64> %passthru -; CHECK-NEXT: ret <4 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> 
[[TMP1]], <4 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] ; %a = tail call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 undef, i64 2, i64 1, i64 0>, <4 x i64> %passthru, i8 %mask) ret <4 x i64> %a @@ -1554,7 +1554,7 @@ declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64 define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { ; CHECK-LABEL: @identity_test_permvar_df_256( -; CHECK-NEXT: ret <4 x double> %a0 +; CHECK-NEXT: ret <4 x double> [[A0:%.*]] ; %a = tail call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x double> undef, i8 -1) ret <4 x double> %a @@ -1562,10 +1562,10 @@ define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { ; CHECK-LABEL: @identity_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> %a0, <4 x double> %passthru -; CHECK-NEXT: ret <4 x double> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[A0:%.*]], <4 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP2]] ; %a = tail call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x double> %passthru, i8 %mask) ret <4 x double> %a @@ -1573,7 +1573,7 @@ define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x do define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { ; CHECK-LABEL: @zero_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %a = tail call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer, <4 x double> undef, i8 -1) @@ -1582,11 +1582,11 @@ define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { ; CHECK-LABEL: @zero_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> %passthru -; CHECK-NEXT: ret <4 x double> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] ; %a = 
tail call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer, <4 x double> %passthru, i8 %mask) ret <4 x double> %a @@ -1594,7 +1594,7 @@ define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %a = tail call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 3, i64 2, i64 1, i64 0>, <4 x double> undef, i8 -1) @@ -1603,11 +1603,11 @@ define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> %passthru -; CHECK-NEXT: ret <4 x double> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] ; %a = tail call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 3, i64 2, i64 1, i64 0>, <4 x double> %passthru, i8 %mask) ret <4 x double> %a @@ -1615,7 +1615,7 @@ define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x dou define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { ; CHECK-LABEL: @undef_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %a = tail call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 undef, i64 2, i64 1, i64 0>, <4 x double> undef, i8 -1) @@ -1624,11 +1624,11 @@ define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { ; CHECK-LABEL: @undef_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> %passthru -; CHECK-NEXT: ret <4 x double> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = 
shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] ; %a = tail call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 undef, i64 2, i64 1, i64 0>, <4 x double> %passthru, i8 %mask) ret <4 x double> %a @@ -1638,7 +1638,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { ; CHECK-LABEL: @identity_test_permvar_si_512( -; CHECK-NEXT: ret <16 x i32> %a0 +; CHECK-NEXT: ret <16 x i32> [[A0:%.*]] ; %a = tail call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, <16 x i32> undef, i16 -1) ret <16 x i32> %a @@ -1646,8 +1646,8 @@ define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { ; CHECK-LABEL: @identity_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> %a0, <16 x i32> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i32> [[TMP2]] ; %a = tail call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, <16 x i32> %passthru, i16 %mask) @@ -1656,7 +1656,7 @@ define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { ; CHECK-LABEL: @zero_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i32> [[TMP1]] ; %a = tail call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer, <16 x i32> undef, i16 -1) @@ -1665,9 +1665,9 @@ define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { ; CHECK-LABEL: @zero_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i32> [[TMP3]] ; %a = tail call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer, 
<16 x i32> %passthru, i16 %mask) @@ -1676,7 +1676,7 @@ define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pas define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <16 x i32> [[TMP1]] ; %a = tail call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> undef, i16 -1) @@ -1685,9 +1685,9 @@ define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i32> [[TMP3]] ; %a = tail call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %passthru, i16 %mask) @@ -1696,7 +1696,7 @@ define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> % define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { ; CHECK-LABEL: @undef_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <16 x i32> [[TMP1]] ; %a = tail call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> undef, i16 -1) @@ -1705,9 +1705,9 @@ define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { ; CHECK-LABEL: @undef_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, 
i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i32> [[TMP3]] ; %a = tail call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %passthru, i16 %mask) @@ -1718,7 +1718,7 @@ declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i3 define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { ; CHECK-LABEL: @identity_test_permvar_sf_512( -; CHECK-NEXT: ret <16 x float> %a0 +; CHECK-NEXT: ret <16 x float> [[A0:%.*]] ; %a = tail call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, <16 x float> undef, i16 -1) ret <16 x float> %a @@ -1726,8 +1726,8 @@ define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { ; CHECK-LABEL: @identity_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> %a0, <16 x float> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP2]] ; %a = tail call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, <16 x float> %passthru, i16 %mask) @@ -1736,7 +1736,7 @@ define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x f define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { ; CHECK-LABEL: @zero_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a = tail call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer, <16 x float> undef, i16 -1) @@ -1745,9 +1745,9 @@ define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { ; CHECK-LABEL: @zero_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> 
zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP3]] ; %a = tail call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer, <16 x float> %passthru, i16 %mask) @@ -1756,7 +1756,7 @@ define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a = tail call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> undef, i16 -1) @@ -1765,9 +1765,9 @@ define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP3]] ; %a = tail call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %passthru, i16 %mask) @@ -1776,7 +1776,7 @@ define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x fl define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { ; CHECK-LABEL: @undef_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a = tail call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> 
undef, i16 -1) @@ -1785,9 +1785,9 @@ define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { ; CHECK-LABEL: @undef_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP3]] ; %a = tail call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %passthru, i16 %mask) @@ -1798,7 +1798,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { ; CHECK-LABEL: @identity_test_permvar_di_512( -; CHECK-NEXT: ret <8 x i64> %a0 +; CHECK-NEXT: ret <8 x i64> [[A0:%.*]] ; %a = tail call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, <8 x i64> undef, i8 -1) ret <8 x i64> %a @@ -1806,8 +1806,8 @@ define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: @identity_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> %a0, <8 x i64> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i64> [[TMP2]] ; %a = tail call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, <8 x i64> %passthru, i8 %mask) @@ -1816,7 +1816,7 @@ define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pa define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { ; CHECK-LABEL: @zero_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i64> [[TMP1]] ; %a = tail call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer, <8 x i64> undef, i8 -1) @@ -1825,9 +1825,9 @@ define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: @zero_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; 
CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i64> [[TMP3]] ; %a = tail call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer, <8 x i64> %passthru, i8 %mask) @@ -1836,7 +1836,7 @@ define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passth define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x i64> [[TMP1]] ; %a = tail call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> undef, i8 -1) @@ -1845,9 +1845,9 @@ define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i64> [[TMP3]] ; %a = tail call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %passthru, i8 %mask) @@ -1856,7 +1856,7 @@ define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pas define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { ; CHECK-LABEL: @undef_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x i64> [[TMP1]] ; %a = tail call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> undef, i8 -1) @@ -1865,9 +1865,9 @@ define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { ; CHECK-LABEL: @undef_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], 
<8 x i64> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i64> [[TMP3]] ; %a = tail call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %passthru, i8 %mask) @@ -1878,7 +1878,7 @@ declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64 define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { ; CHECK-LABEL: @identity_test_permvar_df_512( -; CHECK-NEXT: ret <8 x double> %a0 +; CHECK-NEXT: ret <8 x double> [[A0:%.*]] ; %a = tail call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, <8 x double> undef, i8 -1) ret <8 x double> %a @@ -1886,8 +1886,8 @@ define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { ; CHECK-LABEL: @identity_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> %a0, <8 x double> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP2]] ; %a = tail call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, <8 x double> %passthru, i8 %mask) @@ -1896,7 +1896,7 @@ define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x do define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { ; CHECK-LABEL: @zero_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %a = tail call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer, <8 x double> undef, i8 -1) @@ -1905,9 +1905,9 @@ define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { ; CHECK-LABEL: @zero_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP3]] ; %a = tail call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer, <8 x double> %passthru, i8 %mask) @@ -1916,7 +1916,7 @@ define <8 x double> @zero_test_permvar_df_512_mask(<8 x 
double> %a0, <8 x double define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %a = tail call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> undef, i8 -1) @@ -1925,9 +1925,9 @@ define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP3]] ; %a = tail call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %passthru, i8 %mask) @@ -1936,7 +1936,7 @@ define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x dou define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { ; CHECK-LABEL: @undef_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %a = tail call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> undef, i8 -1) @@ -1945,9 +1945,9 @@ define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { ; CHECK-LABEL: @undef_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP3]] ; %a = tail call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x 
i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %passthru, i8 %mask) @@ -1958,7 +1958,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16>, <8 x i16>, <8 define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { ; CHECK-LABEL: @identity_test_permvar_hi_128( -; CHECK-NEXT: ret <8 x i16> %a0 +; CHECK-NEXT: ret <8 x i16> [[A0:%.*]] ; %a = tail call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, <8 x i16> undef, i8 -1) ret <8 x i16> %a @@ -1966,8 +1966,8 @@ define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { ; CHECK-LABEL: @identity_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> %a0, <8 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i16> [[TMP2]] ; %a = tail call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, <8 x i16> %passthru, i8 %mask) @@ -1976,7 +1976,7 @@ define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pa define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { ; CHECK-LABEL: @zero_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i16> [[TMP1]] ; %a = tail call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i16> undef, i8 -1) @@ -1985,9 +1985,9 @@ define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { ; CHECK-LABEL: @zero_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i16> [[TMP3]] ; %a = tail call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i16> %passthru, i8 %mask) @@ -1996,7 +1996,7 @@ define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passth define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x i16> [[TMP1]] ; %a = tail call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <8 
x i16> undef, i8 -1) @@ -2005,9 +2005,9 @@ define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i16> [[TMP3]] ; %a = tail call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <8 x i16> %passthru, i8 %mask) @@ -2016,7 +2016,7 @@ define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pas define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { ; CHECK-LABEL: @undef_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <8 x i16> [[TMP1]] ; %a = tail call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 undef, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <8 x i16> undef, i8 -1) @@ -2025,9 +2025,9 @@ define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { ; CHECK-LABEL: @undef_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <8 x i16> [[TMP3]] ; %a = tail call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 undef, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <8 x i16> %passthru, i8 %mask) @@ -2038,7 +2038,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { ; CHECK-LABEL: @identity_test_permvar_hi_256( -; CHECK-NEXT: ret <16 x i16> %a0 +; CHECK-NEXT: ret <16 x i16> [[A0:%.*]] ; %a = tail call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, <16 x i16> undef, i16 -1) ret <16 x i16> %a @@ -2046,8 +2046,8 @@ define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { define <16 x i16> 
@identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { ; CHECK-LABEL: @identity_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> %a0, <16 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i16> [[TMP2]] ; %a = tail call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, <16 x i16> %passthru, i16 %mask) @@ -2056,7 +2056,7 @@ define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { ; CHECK-LABEL: @zero_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i16> [[TMP1]] ; %a = tail call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer, <16 x i16> undef, i16 -1) @@ -2065,9 +2065,9 @@ define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { ; CHECK-LABEL: @zero_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i16> [[TMP3]] ; %a = tail call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer, <16 x i16> %passthru, i16 %mask) @@ -2076,7 +2076,7 @@ define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pas define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <16 x i16> [[TMP1]] ; %a = tail call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> undef, i16 -1) @@ -2085,9 +2085,9 @@ define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 
15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i16> [[TMP3]] ; %a = tail call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %passthru, i16 %mask) @@ -2096,7 +2096,7 @@ define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> % define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { ; CHECK-LABEL: @undef_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <16 x i16> [[TMP1]] ; %a = tail call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 undef, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> undef, i16 -1) @@ -2105,9 +2105,9 @@ define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { ; CHECK-LABEL: @undef_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i16> [[TMP3]] ; %a = tail call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 undef, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %passthru, i16 %mask) @@ -2118,7 +2118,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { ; CHECK-LABEL: @identity_test_permvar_hi_512( -; CHECK-NEXT: ret <32 x i16> %a0 +; CHECK-NEXT: ret <32 x i16> [[A0:%.*]] ; %a = tail call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 
3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1) ret <32 x i16> %a @@ -2126,8 +2126,8 @@ define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { ; CHECK-LABEL: @identity_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 %mask to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> %a0, <32 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <32 x i16> [[TMP2]] ; %a = tail call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, <32 x i16> %passthru, i32 %mask) @@ -2136,7 +2136,7 @@ define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { ; CHECK-LABEL: @zero_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: ret <32 x i16> [[TMP1]] ; %a = tail call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer, <32 x i16> undef, i32 -1) @@ -2145,9 +2145,9 @@ define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { ; CHECK-LABEL: @zero_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 %mask to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <32 x i16> [[TMP3]] ; %a = tail call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer, <32 x i16> %passthru, i32 %mask) @@ -2156,7 +2156,7 @@ define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pas define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 
19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <32 x i16> [[TMP1]] ; %a = tail call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> undef, i32 -1) @@ -2165,9 +2165,9 @@ define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 %mask to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <32 x i16> [[TMP3]] ; %a = tail call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %passthru, i32 %mask) @@ -2176,7 +2176,7 @@ define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> % define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { ; CHECK-LABEL: @undef_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <32 x i16> [[TMP1]] ; %a = tail call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 undef, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> undef, i32 -1) @@ -2185,9 +2185,9 @@ define <32 x i16> 
@undef_test_permvar_hi_512(<32 x i16> %a0) { define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { ; CHECK-LABEL: @undef_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 %mask to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <32 x i16> [[TMP3]] ; %a = tail call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 undef, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %passthru, i32 %mask) @@ -2198,7 +2198,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8>, <16 x i8>, <16 define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { ; CHECK-LABEL: @identity_test_permvar_qi_128( -; CHECK-NEXT: ret <16 x i8> %a0 +; CHECK-NEXT: ret <16 x i8> [[A0:%.*]] ; %a = tail call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> undef, i16 -1) ret <16 x i8> %a @@ -2206,8 +2206,8 @@ define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { ; CHECK-LABEL: @identity_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> %a0, <16 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i8> [[TMP2]] ; %a = tail call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> %passthru, i16 %mask) @@ -2216,7 +2216,7 @@ define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pa define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { ; CHECK-LABEL: @zero_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %a = tail call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i8> undef, 
i16 -1) @@ -2225,9 +2225,9 @@ define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { ; CHECK-LABEL: @zero_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i8> [[TMP3]] ; %a = tail call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i8> %passthru, i16 %mask) @@ -2236,7 +2236,7 @@ define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passth define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %a = tail call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> undef, i16 -1) @@ -2245,9 +2245,9 @@ define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i8> [[TMP3]] ; %a = tail call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %passthru, i16 %mask) @@ -2256,7 +2256,7 @@ define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pas define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { ; CHECK-LABEL: @undef_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> 
[[A0:%.*]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %a = tail call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> undef, i16 -1) @@ -2265,9 +2265,9 @@ define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { ; CHECK-LABEL: @undef_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <16 x i8> [[TMP3]] ; %a = tail call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %passthru, i16 %mask) @@ -2278,7 +2278,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { ; CHECK-LABEL: @identity_test_permvar_qi_256( -; CHECK-NEXT: ret <32 x i8> %a0 +; CHECK-NEXT: ret <32 x i8> [[A0:%.*]] ; %a = tail call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, <32 x i8> undef, i32 -1) ret <32 x i8> %a @@ -2286,8 +2286,8 @@ define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { ; CHECK-LABEL: @identity_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 %mask to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> %a0, <32 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <32 x i8> [[TMP2]] ; %a = tail call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, <32 x i8> %passthru, i32 %mask) @@ -2296,7 +2296,7 @@ define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pa define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { ; CHECK-LABEL: @zero_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> zeroinitializer +; 
CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: ret <32 x i8> [[TMP1]] ; %a = tail call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i8> undef, i32 -1) @@ -2305,9 +2305,9 @@ define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { ; CHECK-LABEL: @zero_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 %mask to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <32 x i8> [[TMP3]] ; %a = tail call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i8> %passthru, i32 %mask) @@ -2316,7 +2316,7 @@ define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passth define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <32 x i8> [[TMP1]] ; %a = tail call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <32 x i8> undef, i32 -1) @@ -2325,9 +2325,9 @@ define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 %mask to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: 
[[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <32 x i8> [[TMP3]] ; %a = tail call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <32 x i8> %passthru, i32 %mask) @@ -2336,7 +2336,7 @@ define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pas define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { ; CHECK-LABEL: @undef_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <32 x i8> [[TMP1]] ; %a = tail call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 undef, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <32 x i8> undef, i32 -1) @@ -2345,9 +2345,9 @@ define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { ; CHECK-LABEL: @undef_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 %mask to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <32 x i8> [[TMP3]] ; %a = tail call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 undef, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <32 x i8> %passthru, i32 %mask) @@ -2358,7 +2358,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 define <64 x i8> 
@identity_test_permvar_qi_512(<64 x i8> %a0) { ; CHECK-LABEL: @identity_test_permvar_qi_512( -; CHECK-NEXT: ret <64 x i8> %a0 +; CHECK-NEXT: ret <64 x i8> [[A0:%.*]] ; %a = tail call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63>, <64 x i8> undef, i64 -1) ret <64 x i8> %a @@ -2366,8 +2366,8 @@ define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { ; CHECK-LABEL: @identity_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 %mask to <64 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> %a0, <64 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <64 x i8> [[TMP2]] ; %a = tail call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63>, <64 x i8> %passthru, i64 %mask) @@ -2376,7 +2376,7 @@ define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pa define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { ; CHECK-LABEL: @zero_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: ret <64 x i8> [[TMP1]] ; %a = tail call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer, <64 x i8> undef, i64 -1) @@ -2385,9 +2385,9 @@ define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { ; CHECK-LABEL: @zero_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 %mask to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <64 x i8> [[TMP3]] ; %a = tail call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer, <64 x i8> %passthru, i64 %mask) @@ -2396,7 +2396,7 @@ define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passth define <64 x i8> 
@shuffle_test_permvar_qi_512(<64 x i8> %a0) { ; CHECK-LABEL: @shuffle_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <64 x i8> [[TMP1]] ; %a = tail call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 63, i8 62, i8 61, i8 60, i8 59, i8 58, i8 57, i8 56, i8 55, i8 54, i8 53, i8 52, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> undef, i64 -1) @@ -2405,9 +2405,9 @@ define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { ; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 %mask to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select 
<64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <64 x i8> [[TMP3]] ; %a = tail call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 63, i8 62, i8 61, i8 60, i8 59, i8 58, i8 57, i8 56, i8 55, i8 54, i8 53, i8 52, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> %passthru, i64 %mask) @@ -2416,7 +2416,7 @@ define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pas define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { ; CHECK-LABEL: @undef_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: ret <64 x i8> [[TMP1]] ; %a = tail call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 undef, i8 62, i8 61, i8 60, i8 59, i8 58, i8 57, i8 56, i8 55, i8 54, i8 53, i8 52, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> undef, i64 -1) @@ -2425,9 +2425,9 @@ define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { ; CHECK-LABEL: @undef_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 %mask to <64 x i1> -; 
CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> %passthru +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] ; CHECK-NEXT: ret <64 x i8> [[TMP3]] ; %a = tail call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 undef, i8 62, i8 61, i8 60, i8 59, i8 58, i8 57, i8 56, i8 55, i8 54, i8 53, i8 52, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> %passthru, i64 %mask) @@ -2438,7 +2438,7 @@ declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float> define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_add_ps( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> %a, %b +; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 4) @@ -2447,7 +2447,7 @@ define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_add_ps_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> undef, i16 -1, i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8) @@ -2456,9 +2456,9 @@ define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) { define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; CHECK-LABEL: @test_add_ps_mask( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> %a, %b -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> %c +; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP3]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4) @@ -2467,7 +2467,7 @@ define <16 x 
float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; CHECK-LABEL: @test_add_ps_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> [[C:%.*]], i16 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8) @@ -2478,7 +2478,7 @@ declare <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double>, <8 x double> define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @test_add_pd( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> %a, %b +; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 4) @@ -2487,7 +2487,7 @@ define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) { define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @test_add_pd_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8) @@ -2496,9 +2496,9 @@ define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) { define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_add_pd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> %a, %b -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> %c +; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP3]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4) @@ -2507,7 +2507,7 @@ define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_add_pd_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8) @@ -2518,7 +2518,7 @@ declare 
<16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float> define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_sub_ps( -; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> %a, %b +; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 4) @@ -2527,7 +2527,7 @@ define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) { define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_sub_ps_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> undef, i16 -1, i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8) @@ -2536,9 +2536,9 @@ define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) { define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; CHECK-LABEL: @test_sub_ps_mask( -; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> %a, %b -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> %c +; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP3]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4) @@ -2547,7 +2547,7 @@ define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; CHECK-LABEL: @test_sub_ps_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> [[C:%.*]], i16 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8) @@ -2558,7 +2558,7 @@ declare <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double>, <8 x double> define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @test_sub_pd( -; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> %a, %b +; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 4) @@ -2567,7 +2567,7 @@ define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) { define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @test_sub_pd_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> 
@llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8) @@ -2576,9 +2576,9 @@ define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) { define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_sub_pd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> %a, %b -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> %c +; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP3]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4) @@ -2587,7 +2587,7 @@ define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_sub_pd_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8) @@ -2598,7 +2598,7 @@ declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float> define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_mul_ps( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> %a, %b +; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 4) @@ -2607,7 +2607,7 @@ define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) { define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_mul_ps_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> undef, i16 -1, i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8) @@ -2616,9 +2616,9 @@ define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) { define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; CHECK-LABEL: @test_mul_ps_mask( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> %a, %b -; CHECK-NEXT: 
[[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> %c +; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP3]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4) @@ -2627,7 +2627,7 @@ define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; CHECK-LABEL: @test_mul_ps_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> [[C:%.*]], i16 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8) @@ -2638,7 +2638,7 @@ declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double> define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @test_mul_pd( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> %a, %b +; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 4) @@ -2647,7 +2647,7 @@ define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) { define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @test_mul_pd_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8) @@ -2656,9 +2656,9 @@ define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) { define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mul_pd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> %a, %b -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> %c +; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP3]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4) @@ -2667,7 +2667,7 @@ define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 
%mask) { ; CHECK-LABEL: @test_mul_pd_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8) @@ -2678,7 +2678,7 @@ declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float> define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_div_ps( -; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> %a, %b +; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 4) @@ -2687,7 +2687,7 @@ define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) { define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_div_ps_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> undef, i16 -1, i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8) @@ -2696,9 +2696,9 @@ define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) { define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; CHECK-LABEL: @test_div_ps_mask( -; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> %a, %b -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 %mask to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> %c +; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP3]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4) @@ -2707,7 +2707,7 @@ define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; CHECK-LABEL: @test_div_ps_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> [[C:%.*]], i16 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %1 = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8) @@ -2718,7 +2718,7 @@ declare <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double>, <8 x double> define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: 
@test_div_pd( -; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> %a, %b +; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 4) @@ -2727,7 +2727,7 @@ define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) { define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @test_div_pd_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8) @@ -2736,9 +2736,9 @@ define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) { define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_div_pd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> %a, %b -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 %mask to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> %c +; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP3]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4) @@ -2747,7 +2747,7 @@ define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_div_pd_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %1 = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8) @@ -2758,8 +2758,8 @@ declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) define i32 @test_comi_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_comi_ss_0( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> undef, float %b, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]], i32 0, i32 4) ; CHECK-NEXT: ret i32 [[TMP3]] ; @@ -2779,8 +2779,8 @@ declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) define i32 @test_comi_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_comi_sd_0( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double %a, i32 0 -; CHECK-NEXT: 
[[TMP2:%.*]] = insertelement <2 x double> undef, double %b, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]], i32 0, i32 4) ; CHECK-NEXT: ret i32 [[TMP3]] ; diff --git a/test/Transforms/InstCombine/x86-muldq.ll b/test/Transforms/InstCombine/x86-muldq.ll index 8b14a781f091..bcbb8919c403 100644 --- a/test/Transforms/InstCombine/x86-muldq.ll +++ b/test/Transforms/InstCombine/x86-muldq.ll @@ -2,6 +2,158 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s ; +; UNDEF Elts +; + +define <2 x i64> @undef_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: @undef_pmuludq_128( +; CHECK-NEXT: ret <2 x i64> zeroinitializer +; + %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> undef) + ret <2 x i64> %1 +} + +define <4 x i64> @undef_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @undef_pmuludq_256( +; CHECK-NEXT: ret <4 x i64> zeroinitializer +; + %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> undef, <8 x i32> undef) + ret <4 x i64> %1 +} + +define <8 x i64> @undef_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @undef_pmuludq_512( +; CHECK-NEXT: ret <8 x i64> zeroinitializer +; + %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> undef) + ret <8 x i64> %1 +} + +define <2 x i64> @undef_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: @undef_pmuldq_128( +; CHECK-NEXT: ret <2 x i64> zeroinitializer +; + %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> undef, <4 x i32> undef) + ret <2 x i64> %1 +} + +define <4 x i64> @undef_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @undef_pmuldq_256( +; CHECK-NEXT: ret <4 x i64> zeroinitializer +; + %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> undef) + ret <4 x i64> %1 +} + +define <8 x i64> @undef_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @undef_pmuldq_512( +; CHECK-NEXT: ret <8 x i64> zeroinitializer +; + %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> undef, <16 x i32> undef) + ret <8 x i64> %1 +} + +define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: @undef_zero_pmuludq_128( +; CHECK-NEXT: ret <2 x i64> zeroinitializer +; + %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> zeroinitializer) + ret <2 x i64> %1 +} + +define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @undef_zero_pmuludq_256( +; CHECK-NEXT: ret <4 x i64> zeroinitializer +; + %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> undef) + ret <4 x i64> %1 +} + +define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @undef_zero_pmuludq_512( +; CHECK-NEXT: ret <8 x i64> zeroinitializer +; + %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> zeroinitializer) + ret <8 x i64> %1 +} + +define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: @undef_zero_pmuldq_128( +; CHECK-NEXT: ret <2 x i64> zeroinitializer +; + %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> zeroinitializer, <4 x i32> undef) + ret <2 x i64> %1 +} + +define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @undef_zero_pmuldq_256( +; CHECK-NEXT: ret <4 x i64> zeroinitializer +; + %1 
= call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> zeroinitializer) + ret <4 x i64> %1 +} + +define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @undef_zero_pmuldq_512( +; CHECK-NEXT: ret <8 x i64> zeroinitializer +; + %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> undef) + ret <8 x i64> %1 +} + +; +; Constant Folding +; + +define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: @fold_pmuludq_128( +; CHECK-NEXT: ret <2 x i64> <i64 9223372030412324865, i64 4294967295> +; + %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2147483647, i32 1, i32 1, i32 3>) + ret <2 x i64> %1 +} + +define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @fold_pmuludq_256( +; CHECK-NEXT: ret <4 x i64> zeroinitializer +; + %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer) + ret <4 x i64> %1 +} + +define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @fold_pmuludq_512( +; CHECK-NEXT: ret <8 x i64> <i64 0, i64 0, i64 255, i64 131070, i64 0, i64 -281474976645121, i64 140737488289792, i64 281470681743360> +; + %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 0, i32 undef, i32 0, i32 1, i32 1, i32 2, i32 2, i32 undef, i32 undef, i32 -1, i32 -1, i32 65536, i32 -1, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 255, i32 -256, i32 65535, i32 -65536, i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>) + ret <8 x i64> %1 +} + +define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: @fold_pmuldq_128( +; CHECK-NEXT: ret <2 x i64> <i64 0, i64 2> +; + %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 undef, i32 1, i32 -2, i32 3>) + ret <2 x i64> %1 +} + +define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @fold_pmuldq_256( +; CHECK-NEXT: ret <4 x i64> <i64 0, i64 4294836225, i64 140737488289792, i64 -140737488355328> +; + %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>, <8 x i32> <i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>) + ret <4 x i64> %1 +} + +define <8 x i64> @fold_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @fold_pmuldq_512( +; CHECK-NEXT: ret <8 x i64> zeroinitializer +; + %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> <i32 undef, i32 -1, i32 -3, i32 -1, i32 8, i32 10, i32 -256, i32 65536, i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>) + ret <8 x i64> %1 +} + +; ; PMULUDQ/PMULDQ - only the even elements (0, 2, 4, 6) of the vXi32 inputs are required. 
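Only the even i32 elements feed these multiplies because each 64-bit product is formed from the low 32-bit half of the corresponding 64-bit input lane. A minimal scalar sketch of lane 0 of the unsigned 128-bit case (the function name is illustrative; the signed pmuldq variant would use sext instead of zext):

define i64 @pmuludq_lane0(<4 x i32> %a, <4 x i32> %b) {
  %a0 = extractelement <4 x i32> %a, i32 0   ; even element 0; lane 1 would read element 2
  %b0 = extractelement <4 x i32> %b, i32 0
  %za = zext i32 %a0 to i64                  ; zero-extend the low 32-bit halves
  %zb = zext i32 %b0 to i64
  %r = mul i64 %za, %zb                      ; full 64-bit product
  ret i64 %r
}

This is the demanded-elements fact the tests below rely on when they leave the odd shuffle lanes undef.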
; @@ -55,8 +207,8 @@ define <2 x i64> @test_demanded_elts_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) { ret <2 x i64> %3 } -define <4 x i64> @test_demanded_elts_pmuluq_256(<8 x i32> %a0, <8 x i32> %a1) { -; CHECK-LABEL: @test_demanded_elts_pmuluq_256( +define <4 x i64> @test_demanded_elts_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @test_demanded_elts_pmuldq_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef> ; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3> @@ -69,8 +221,8 @@ define <4 x i64> @test_demanded_elts_pmuluq_256(<8 x i32> %a0, <8 x i32> %a1) { ret <4 x i64> %4 } -define <8 x i64> @test_demanded_elts_pmuluq_512(<16 x i32> %a0, <16 x i32> %a1) { -; CHECK-LABEL: @test_demanded_elts_pmuluq_512( +define <8 x i64> @test_demanded_elts_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @test_demanded_elts_pmuldq_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 15, i32 undef> ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a0, <16 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7> diff --git a/test/Transforms/InstCombine/x86-pack.ll b/test/Transforms/InstCombine/x86-pack.ll new file mode 100644 index 000000000000..f3c41a8aa476 --- /dev/null +++ b/test/Transforms/InstCombine/x86-pack.ll @@ -0,0 +1,366 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +; +; UNDEF Elts +; + +define <8 x i16> @undef_packssdw_128() { +; CHECK-LABEL: @undef_packssdw_128( +; CHECK-NEXT: ret <8 x i16> undef +; + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> undef, <4 x i32> undef) + ret <8 x i16> %1 +} + +define <8 x i16> @undef_packusdw_128() { +; CHECK-LABEL: @undef_packusdw_128( +; CHECK-NEXT: ret <8 x i16> undef +; + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> undef) + ret <8 x i16> %1 +} + +define <16 x i8> @undef_packsswb_128() { +; CHECK-LABEL: @undef_packsswb_128( +; CHECK-NEXT: ret <16 x i8> undef +; + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> undef, <8 x i16> undef) + ret <16 x i8> %1 +} + +define <16 x i8> @undef_packuswb_128() { +; CHECK-LABEL: @undef_packuswb_128( +; CHECK-NEXT: ret <16 x i8> undef +; + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> undef, <8 x i16> undef) + ret <16 x i8> %1 +} + +define <16 x i16> @undef_packssdw_256() { +; CHECK-LABEL: @undef_packssdw_256( +; CHECK-NEXT: ret <16 x i16> undef +; + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> undef) + ret <16 x i16> %1 +} + +define <16 x i16> @undef_packusdw_256() { +; CHECK-LABEL: @undef_packusdw_256( +; CHECK-NEXT: ret <16 x i16> undef +; + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> undef) + ret <16 x i16> %1 +} + +define <32 x i8> @undef_packsswb_256() { +; CHECK-LABEL: @undef_packsswb_256( +; CHECK-NEXT: ret <32 x i8> undef +; + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, 
<16 x i16> undef) + ret <32 x i8> %1 +} + +define <32 x i8> @undef_packuswb_256() { +; CHECK-LABEL: @undef_packuswb_256( +; CHECK-NEXT: ret <32 x i8> undef +; + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> undef, <16 x i16> undef) + ret <32 x i8> %1 +} + +define <32 x i16> @undef_packssdw_512() { +; CHECK-LABEL: @undef_packssdw_512( +; CHECK-NEXT: ret <32 x i16> undef +; + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> undef, <16 x i32> undef) + ret <32 x i16> %1 +} + +define <32 x i16> @undef_packusdw_512() { +; CHECK-LABEL: @undef_packusdw_512( +; CHECK-NEXT: ret <32 x i16> undef +; + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> undef) + ret <32 x i16> %1 +} + +define <64 x i8> @undef_packsswb_512() { +; CHECK-LABEL: @undef_packsswb_512( +; CHECK-NEXT: ret <64 x i8> undef +; + %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> undef) + ret <64 x i8> %1 +} + +define <64 x i8> @undef_packuswb_512() { +; CHECK-LABEL: @undef_packuswb_512( +; CHECK-NEXT: ret <64 x i8> undef +; + %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> undef, <32 x i16> undef) + ret <64 x i8> %1 +} + +; +; Constant Folding +; + +define <8 x i16> @fold_packssdw_128() { +; CHECK-LABEL: @fold_packssdw_128( +; CHECK-NEXT: ret <8 x i16> <i16 0, i16 -1, i16 32767, i16 -32768, i16 0, i16 0, i16 0, i16 0> +; + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> <i32 0, i32 -1, i32 65536, i32 -131072>, <4 x i32> zeroinitializer) + ret <8 x i16> %1 +} + +define <8 x i16> @fold_packusdw_128() { +; CHECK-LABEL: @fold_packusdw_128( +; CHECK-NEXT: ret <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 0, i16 -32768, i16 -1> +; + %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> <i32 0, i32 -1, i32 32768, i32 65537>) + ret <8 x i16> %1 +} + +define <16 x i8> @fold_packsswb_128() { +; CHECK-LABEL: @fold_packsswb_128( +; CHECK-NEXT: ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> +; + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> zeroinitializer, <8 x i16> undef) + ret <16 x i8> %1 +} + +define <16 x i8> @fold_packuswb_128() { +; CHECK-LABEL: @fold_packuswb_128( +; CHECK-NEXT: ret <16 x i8> <i8 0, i8 1, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 15, i8 0, i8 127, i8 0, i8 1, i8 0, i8 1, i8 0, i8 0> +; + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 1, i16 -1, i16 255, i16 65535, i16 -32768, i16 -127, i16 15>, <8 x i16> <i16 -15, i16 127, i16 32768, i16 -65535, i16 -255, i16 1, i16 -1, i16 0>) + ret <16 x i8> %1 +} + +define <16 x i16> @fold_packssdw_256() { +; CHECK-LABEL: @fold_packssdw_256( +; CHECK-NEXT: ret <16 x i16> <i16 0, i16 256, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef> +; + %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <8 x i32> undef) + ret <16 x i16> %1 +} + +define <16 x i16> @fold_packusdw_256() { +; CHECK-LABEL: @fold_packusdw_256( +; CHECK-NEXT: ret <16 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 256, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767> +; + %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> <i32 0, i32 -256, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <8 x i32> <i32 0, i32 
256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>) + ret <16 x i16> %1 +} + +define <32 x i8> @fold_packsswb_256() { +; CHECK-LABEL: @fold_packsswb_256( +; CHECK-NEXT: ret <32 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> +; + %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> zeroinitializer) + ret <32 x i8> %1 +} + +define <32 x i8> @fold_packuswb_256() { +; CHECK-LABEL: @fold_packuswb_256( +; CHECK-NEXT: ret <32 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64> +; + %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> zeroinitializer, <16 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 256, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>) + ret <32 x i8> %1 +} + +define <32 x i16> @fold_packssdw_512() { +; CHECK-LABEL: @fold_packssdw_512( +; CHECK-NEXT: ret <32 x i16> <i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef> +; + %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <16 x i32> undef) + ret <32 x i16> %1 +} + +define <32 x i16> @fold_packusdw_512() { +; CHECK-LABEL: @fold_packusdw_512( +; CHECK-NEXT: ret <32 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767, i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767> +; + %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> <i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767, i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>) + ret <32 x i16> %1 +} + +define <64 x i8> @fold_packsswb_512() { +; CHECK-LABEL: @fold_packsswb_512( +; CHECK-NEXT: ret <64 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> +; + %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> zeroinitializer) + ret <64 x i8> %1 +} + +define <64 x i8> @fold_packuswb_512() { +; CHECK-LABEL: @fold_packuswb_512( +; CHECK-NEXT: ret <64 x i8> <i8 0, i8 0, i8 0, i8 
0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64> +; + %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> zeroinitializer, <32 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>) + ret <64 x i8> %1 +} + +; +; Demanded Elts +; + +define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: @elts_packssdw_128( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[A0:%.*]], <4 x i32> undef) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: ret <8 x i16> [[TMP2]] +; + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef> + %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef> + %3 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %2) + %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 7, i32 7, i32 7, i32 7> + ret <8 x i16> %4 +} + +define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: @elts_packusdw_128( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; + %1 = insertelement <4 x i32> %a0, i32 0, i32 0 + %2 = insertelement <4 x i32> %a1, i32 0, i32 3 + %3 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %1, <4 x i32> %2) + %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef> + ret <8 x i16> %4 +} + +define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: @elts_packsswb_128( +; CHECK-NEXT: ret <16 x i8> zeroinitializer +; + %1 = insertelement <8 x i16> %a0, i16 0, i32 0 + %2 = insertelement <8 x i16> %a1, i16 0, i32 0 + %3 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2) + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i8> %4 +} + +define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: @elts_packuswb_128( +; CHECK-NEXT: ret <16 x i8> undef +; + %1 = insertelement <8 x i16> undef, i16 0, i32 0 + %2 = insertelement <8 x i16> undef, i16 0, i32 0 + %3 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2) + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> + ret <16 x i8> %4 +} + +define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @elts_packssdw_256( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[A0:%.*]], <8 x i32> undef) +; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; + %1 = 
shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef> + %3 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %2) + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15> + ret <16 x i16> %4 +} + +define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @elts_packusdw_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; + %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + %3 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %2) + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i16> %4 +} + +define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK-LABEL: @elts_packsswb_256( +; CHECK-NEXT: ret <32 x i8> zeroinitializer +; + %1 = insertelement <16 x i16> %a0, i16 0, i32 0 + %2 = insertelement <16 x i16> %a1, i16 0, i32 8 + %3 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2) + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> + ret <32 x i8> %4 +} + +define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK-LABEL: @elts_packuswb_256( +; CHECK-NEXT: ret <32 x i8> undef +; + %1 = insertelement <16 x i16> undef, i16 0, i32 1 + %2 = insertelement <16 x i16> undef, i16 0, i32 0 + %3 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2) + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> zeroinitializer + ret <32 x i8> %4 +} + +define <32 x i16> @elts_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @elts_packssdw_512( +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A0:%.*]], <16 x i32> undef) +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef, i32 undef, i32 10, i32 9, i32 undef, i32 undef, i32 14, i32 13, i32 undef> + %3 = call <32 x i16> 
@llvm.x86.avx512.packssdw.512(<16 x i32> %1, <16 x i32> %2) + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 18, i32 19, i32 20, i32 undef, i32 undef, i32 23, i32 24, i32 undef, i32 undef, i32 27, i32 28, i32 undef, i32 undef, i32 31> + ret <32 x i16> %4 +} + +define <32 x i16> @elts_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @elts_packusdw_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> +; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: ret <32 x i16> [[TMP3]] +; + %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> + %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %1, <16 x i32> %2) + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef> + ret <32 x i16> %4 +} + +define <64 x i8> @elts_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) { +; CHECK-LABEL: @elts_packsswb_512( +; CHECK-NEXT: ret <64 x i8> zeroinitializer +; + %1 = insertelement <32 x i16> %a0, i16 0, i32 0 + %2 = insertelement <32 x i16> %a1, i16 0, i32 8 + %3 = insertelement <32 x i16> %1, i16 0, i32 16 + %4 = insertelement <32 x i16> %2, i16 0, i32 24 + %5 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %3, <32 x i16> %4) + %6 = shufflevector <64 x i8> %5, <64 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56> + ret <64 x i8> %6 +} + +define <64 x i8> @elts_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) { +; CHECK-LABEL: @elts_packuswb_512( +; CHECK-NEXT: ret <64 x i8> undef +; + %1 = insertelement <32 x i16> undef, i16 0, i32 1 + %2 = insertelement <32 x i16> undef, i16 0, i32 0 + %3 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %1, <32 x i16> %2) + %4 = shufflevector <64 x i8> 
%3, <64 x i8> undef, <64 x i32> zeroinitializer + ret <64 x i8> %4 +} + +declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone +declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone +declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone +declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone +declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone +declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone + +declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone +declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone +declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone +declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone diff --git a/test/Transforms/InstCombine/x86-pshufb.ll b/test/Transforms/InstCombine/x86-pshufb.ll index b37884ddd58a..f181ef57fe20 100644 --- a/test/Transforms/InstCombine/x86-pshufb.ll +++ b/test/Transforms/InstCombine/x86-pshufb.ll @@ -468,6 +468,48 @@ define <64 x i8> @fold_with_allundef_elts_avx512(<64 x i8> %InVec) { ret <64 x i8> %1 } +; Demanded elts tests. + +define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask, i8 %M0, i8 %M15) { +; CHECK-LABEL: @demanded_elts_insertion( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> %BaseMask) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef> +; CHECK-NEXT: ret <16 x i8> [[TMP2]] +; + %1 = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0 + %2 = insertelement <16 x i8> %1, i8 %M15, i32 15 + %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> %2) + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef> + ret <16 x i8> %4 +} + +define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %BaseMask, i8 %M0, i8 %M22) { +; CHECK-LABEL: @demanded_elts_insertion_avx2( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP1]]) +; CHECK-NEXT: ret <32 x i8> [[TMP2]] +; + %1 = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0 + %2 = insertelement <32 x i8> %1, i8 %M22, i32 22 + %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> %2) + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 undef, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <32 x i8> %4 +} + +define <64 x i8> @demanded_elts_insertion_avx512(<64 x i8> %InVec, <64 x i8> %BaseMask, i8 %M0, i8 %M30) { +; CHECK-LABEL: @demanded_elts_insertion_avx512( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <64 x i8> undef, 
i8 %M0, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: ret <64 x i8> [[TMP3]] +; + %1 = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0 + %2 = insertelement <64 x i8> %1, i8 %M30, i32 30 + %3 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> %2) + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> zeroinitializer + ret <64 x i8> %4 +} + declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) diff --git a/test/Transforms/InstCombine/x86-vpermil.ll b/test/Transforms/InstCombine/x86-vpermil.ll index fad10d7ad5c5..f68eb36c4b58 100644 --- a/test/Transforms/InstCombine/x86-vpermil.ll +++ b/test/Transforms/InstCombine/x86-vpermil.ll @@ -221,6 +221,74 @@ define <8 x double> @undef_test_vpermilvar_pd_512(<8 x double> %v) { ret <8 x double> %a } +; Simplify demanded elts + +define <4 x float> @elts_test_vpermilvar_ps(<4 x float> %a0, i32 %a1) { +; CHECK-LABEL: @elts_test_vpermilvar_ps( +; CHECK-NEXT: ret <4 x float> %a0 +; + %1 = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %a1, i32 3 + %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %1) + %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + ret <4 x float> %3 +} + +define <8 x float> @elts_test_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @elts_test_vpermilvar_ps_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 6, i32 undef, i32 7> +; CHECK-NEXT: ret <8 x float> [[TMP1]] +; + %1 = shufflevector <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %1) + %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 3, i32 undef, i32 5, i32 undef, i32 7> + ret <8 x float> %3 +} + +define <16 x float> @elts_test_vpermilvar_ps_512(<16 x float> %a0, <16 x i32> %a1, i32 %a2) { +; CHECK-LABEL: @elts_test_vpermilvar_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1) +; CHECK-NEXT: ret <16 x float> [[TMP1]] +; + %1 = insertelement <16 x i32> %a1, i32 %a2, i32 0 + %2 = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %1) + %3 = shufflevector <16 x float> %2, <16 x float> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %3 +} + +define <2 x double> @elts_test_vpermilvar_pd(<2 x double> %a0, i64 %a1) { +; CHECK-LABEL: @elts_test_vpermilvar_pd( +; CHECK-NEXT: ret <2 x double> %a0 +; + %1 = insertelement <2 x i64> <i64 0, i64 2>, i64 %a1, i32 1 + %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %1) + %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> <i32 0, i32 undef> + ret <2 x double> %3 +} + +define <4 x double> @elts_test_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) { +; 
CHECK-LABEL: @elts_test_vpermilvar_pd_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 undef> +; CHECK-NEXT: ret <4 x double> [[TMP1]] +; + %1 = shufflevector <4 x i64> <i64 0, i64 2, i64 0, i64 2>, <4 x i64> %a1, <4 x i32> <i32 1, i32 2, i32 3, i32 4> + %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %1) + %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + ret <4 x double> %3 +} + +define <8 x double> @elts_test_vpermilvar_pd_512(<8 x double> %a0, <8 x i64> %a1, i64 %a2) { +; CHECK-LABEL: @elts_test_vpermilvar_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i64> undef, i64 %a2, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x double> [[TMP3]] +; + %1 = insertelement <8 x i64> %a1, i64 %a2, i32 0 + %2 = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %1) + %3 = shufflevector <8 x double> %2, <8 x double> undef, <8 x i32> zeroinitializer + ret <8 x double> %3 +} + declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>) diff --git a/test/Transforms/InstCombine/xor.ll b/test/Transforms/InstCombine/xor.ll index cd137776bbfd..570155b16232 100644 --- a/test/Transforms/InstCombine/xor.ll +++ b/test/Transforms/InstCombine/xor.ll @@ -321,7 +321,7 @@ define i32 @test25(i32 %g, i32 %h) { define i32 @test26(i32 %a, i32 %b) { ; CHECK-LABEL: @test26( -; CHECK-NEXT: [[T4:%.*]] = and i32 %a, %b +; CHECK-NEXT: [[T4:%.*]] = and i32 %b, %a ; CHECK-NEXT: ret i32 [[T4]] ; %b2 = xor i32 %b, -1 @@ -352,3 +352,187 @@ define i32 @test28(i32 %indvar) { %t214 = xor i32 %t7, -2147483648 ret i32 %t214 } + +define i32 @test29(i1 %C) { +; CHECK-LABEL: @test29( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], i32 915, i32 113 +; CHECK-NEXT: ret i32 [[V]] +; + %A = select i1 %C, i32 1000, i32 10 + %V = xor i32 %A, 123 + ret i32 %V +} + +define <2 x i32> @test29vec(i1 %C) { +; CHECK-LABEL: @test29vec( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 915, i32 915>, <2 x i32> <i32 113, i32 113> +; CHECK-NEXT: ret <2 x i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10> + %V = xor <2 x i32> %A, <i32 123, i32 123> + ret <2 x i32> %V +} + +define <2 x i32> @test29vec2(i1 %C) { +; CHECK-LABEL: @test29vec2( +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 915, i32 2185>, <2 x i32> <i32 113, i32 339> +; CHECK-NEXT: ret <2 x i32> [[V]] +; + %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30> + %V = xor <2 x i32> %A, <i32 123, i32 333> + ret <2 x i32> %V +} + +define i32 @test30(i1 %which) { +; CHECK-LABEL: @test30( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi i32 [ 915, [[ENTRY:%.*]] ], [ 113, [[DELAY]] ] +; CHECK-NEXT: ret i32 [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi i32 [ 1000, %entry ], [ 10, %delay ] + %value = xor i32 
%A, 123 + ret i32 %value +} + +define <2 x i32> @test30vec(i1 %which) { +; CHECK-LABEL: @test30vec( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 915, i32 915>, [[ENTRY:%.*]] ], [ <i32 113, i32 113>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ] + %value = xor <2 x i32> %A, <i32 123, i32 123> + ret <2 x i32> %value +} + +define <2 x i32> @test30vec2(i1 %which) { +; CHECK-LABEL: @test30vec2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]] +; CHECK: delay: +; CHECK-NEXT: br label [[FINAL]] +; CHECK: final: +; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 915, i32 2185>, [[ENTRY:%.*]] ], [ <i32 113, i32 339>, [[DELAY]] ] +; CHECK-NEXT: ret <2 x i32> [[A]] +; +entry: + br i1 %which, label %final, label %delay + +delay: + br label %final + +final: + %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ] + %value = xor <2 x i32> %A, <i32 123, i32 333> + ret <2 x i32> %value +} + +define i32 @test31(i32 %A, i32 %B) { +; CHECK-LABEL: @test31( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = and i32 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %and = or i32 %A, %B + %xor = xor i32 %B, %and + ret i32 %xor +} + +define i32 @test32(i32 %A, i32 %B) { +; CHECK-LABEL: @test32( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = and i32 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %and = or i32 %B, %A + %xor = xor i32 %B, %and + ret i32 %xor +} + +define i32 @test33(i32 %A, i32 %B) { +; CHECK-LABEL: @test33( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = and i32 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %and = or i32 %A, %B + %xor = xor i32 %and, %B + ret i32 %xor +} + +define i32 @test34(i32 %A, i32 %B) { +; CHECK-LABEL: @test34( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = and i32 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %and = or i32 %B, %A + %xor = xor i32 %and, %B + ret i32 %xor +} + +define i32 @test35(i32 %A, i32 %B) { +; CHECK-LABEL: @test35( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %and = and i32 %A, %B + %xor = xor i32 %B, %and + ret i32 %xor +} + +define i32 @test36(i32 %A, i32 %B) { +; CHECK-LABEL: @test36( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %and = and i32 %B, %A + %xor = xor i32 %B, %and + ret i32 %xor +} + +define i32 @test37(i32 %A, i32 %B) { +; CHECK-LABEL: @test37( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %and = and i32 %A, %B + %xor = xor i32 %and, %B + ret i32 %xor +} + +define i32 @test38(i32 %A, i32 %B) { +; CHECK-LABEL: @test38( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %and = and i32 %B, %A + %xor = xor i32 %and, %B + ret i32 %xor +} diff --git 
a/test/Transforms/InstCombine/xor2.ll b/test/Transforms/InstCombine/xor2.ll index f3591ed9c8a9..79e62723f143 100644 --- a/test/Transforms/InstCombine/xor2.ll +++ b/test/Transforms/InstCombine/xor2.ll @@ -110,7 +110,7 @@ define i32 @test6(i32 %x) { define i32 @test7(i32 %a, i32 %b) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i32 %b, -1 -; CHECK-NEXT: [[XOR:%.*]] = or i32 %a, [[B_NOT]] +; CHECK-NEXT: [[XOR:%.*]] = or i32 [[B_NOT]], %a ; CHECK-NEXT: ret i32 [[XOR]] ; %or = or i32 %a, %b @@ -123,7 +123,7 @@ define i32 @test7(i32 %a, i32 %b) { define i32 @test8(i32 %a, i32 %b) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i32 %b, -1 -; CHECK-NEXT: [[XOR:%.*]] = or i32 %a, [[B_NOT]] +; CHECK-NEXT: [[XOR:%.*]] = or i32 [[B_NOT]], %a ; CHECK-NEXT: ret i32 [[XOR]] ; %neg = xor i32 %a, -1 @@ -144,6 +144,18 @@ define i32 @test9(i32 %b, i32 %c) { ret i32 %xor2 } +; (A & B) ^ (B ^ A) -> (A | B) +define i32 @test9b(i32 %b, i32 %c) { +; CHECK-LABEL: @test9b( +; CHECK-NEXT: [[XOR2:%.*]] = or i32 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: ret i32 [[XOR2]] +; + %and = and i32 %b, %c + %xor = xor i32 %c, %b + %xor2 = xor i32 %and, %xor + ret i32 %xor2 +} + ; (A ^ B) ^ (A & B) -> (A | B) define i32 @test10(i32 %b, i32 %c) { ; CHECK-LABEL: @test10( @@ -156,6 +168,18 @@ define i32 @test10(i32 %b, i32 %c) { ret i32 %xor2 } +; (A ^ B) ^ (A & B) -> (A | B) +define i32 @test10b(i32 %b, i32 %c) { +; CHECK-LABEL: @test10b( +; CHECK-NEXT: [[XOR2:%.*]] = or i32 [[B:%.*]], [[C:%.*]] +; CHECK-NEXT: ret i32 [[XOR2]] +; + %xor = xor i32 %b, %c + %and = and i32 %c, %b + %xor2 = xor i32 %xor, %and + ret i32 %xor2 +} + define i32 @test11(i32 %A, i32 %B) { ; CHECK-LABEL: @test11( ; CHECK-NEXT: ret i32 0 diff --git a/test/Transforms/InstCombine/zero-point-zero-add.ll b/test/Transforms/InstCombine/zero-point-zero-add.ll index e466e8ad7429..a23db75525e9 100644 --- a/test/Transforms/InstCombine/zero-point-zero-add.ll +++ b/test/Transforms/InstCombine/zero-point-zero-add.ll @@ -15,7 +15,7 @@ define double @test(double %X) { define double @test1(double %X) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[Y:%.*]] = call double @fabs(double %X) +; CHECK-NEXT: [[Y:%.*]] = call double @llvm.fabs.f64(double %X) ; CHECK-NEXT: ret double [[Y]] ; %Y = call double @fabs(double %X) diff --git a/test/Transforms/InstCombine/zext-or-icmp.ll b/test/Transforms/InstCombine/zext-or-icmp.ll index 610e9a754f0d..afbe36da3e37 100644 --- a/test/Transforms/InstCombine/zext-or-icmp.ll +++ b/test/Transforms/InstCombine/zext-or-icmp.ll @@ -19,3 +19,33 @@ define i8 @zext_or_icmp_icmp(i8 %a, i8 %b) { ; CHECK-NEXT: ret i8 %zext } +; Here, widening the or from i1 to i32 and removing one of the icmps would +; widen an undef value (created by the out-of-range shift), increasing the +; range of valid values for the return, so we can't do it. 
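The undef comes from the shift itself: an lshr result is undefined when the shift amount is not less than the bit width, and on the %entry path %m.011 is 33. A standalone sketch of that rule (a hypothetical function, not part of the test file):

define i32 @shift_amount_out_of_range() {
  %u = lshr i32 1, 33   ; shift amount 33 >= bit width 32, so the result is undefined
  ret i32 %u
}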
+define i32 @dont_widen_undef() { +entry: + br label %block2 + +block1: + br label %block2 + +block2: + %m.011 = phi i32 [ 33, %entry ], [ 0, %block1 ] + %cmp.i = icmp ugt i32 %m.011, 1 + %m.1.op = lshr i32 1, %m.011 + %sext.mask = and i32 %m.1.op, 65535 + %cmp115 = icmp ne i32 %sext.mask, 0 + %cmp1 = or i1 %cmp.i, %cmp115 + %conv2 = zext i1 %cmp1 to i32 + ret i32 %conv2 + +; CHECK-LABEL: dont_widen_undef( +; CHECK: %m.011 = phi i32 [ 33, %entry ], [ 0, %block1 ] +; CHECK-NEXT: %cmp.i = icmp ugt i32 %m.011, 1 +; CHECK-NEXT: %m.1.op = lshr i32 1, %m.011 +; CHECK-NEXT: %sext.mask = and i32 %m.1.op, 65535 +; CHECK-NEXT: %cmp115 = icmp ne i32 %sext.mask, 0 +; CHECK-NEXT: %cmp1 = or i1 %cmp.i, %cmp115 +; CHECK-NEXT: %conv2 = zext i1 %cmp1 to i32 +; CHECK-NEXT: ret i32 %conv2 +} diff --git a/test/Transforms/InstCombine/zext-phi.ll b/test/Transforms/InstCombine/zext-phi.ll new file mode 100644 index 000000000000..5e352415c747 --- /dev/null +++ b/test/Transforms/InstCombine/zext-phi.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-n8:16:32:64" + +; Although i1 is not in the datalayout, we should treat it +; as a legal type because it is a fundamental type in IR. +; This means we should shrink the phi (sink the zexts). + +define i64 @sink_i1_casts(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @sink_i1_casts( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 %cond1, label %if, label %end +; CHECK: if: +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[PHI_IN:%.*]] = phi i1 [ %cond1, %entry ], [ %cond2, %if ] +; CHECK-NEXT: [[PHI:%.*]] = zext i1 [[PHI_IN]] to i64 +; CHECK-NEXT: ret i64 [[PHI]] +; +entry: + %z1 = zext i1 %cond1 to i64 + br i1 %cond1, label %if, label %end + +if: + %z2 = zext i1 %cond2 to i64 + br label %end + +end: + %phi = phi i64 [ %z1, %entry ], [ %z2, %if ] + ret i64 %phi +} + diff --git a/test/Transforms/InstCombine/zext.ll b/test/Transforms/InstCombine/zext.ll index 740509809d1c..887d839cb8c7 100644 --- a/test/Transforms/InstCombine/zext.ll +++ b/test/Transforms/InstCombine/zext.ll @@ -35,7 +35,7 @@ define <2 x i64> @test3(<2 x i64> %A) { define <2 x i64> @test4(<2 x i64> %A) { ; CHECK-LABEL: @test4( -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i64> %A, <i64 4294967295, i64 4294967295> +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i64> %A, <i64 63, i64 63> ; CHECK-NEXT: [[XOR:%.*]] = and <2 x i64> [[TMP1]], <i64 23, i64 42> ; CHECK-NEXT: ret <2 x i64> [[XOR]] ; diff --git a/test/Transforms/InstSimplify/AndOrXor.ll b/test/Transforms/InstSimplify/AndOrXor.ll index c6959d72961d..33fd978277d4 100644 --- a/test/Transforms/InstSimplify/AndOrXor.ll +++ b/test/Transforms/InstSimplify/AndOrXor.ll @@ -1,6 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instsimplify -S | FileCheck %s +; add nsw (xor X, signbit), signbit --> X + +define <2 x i32> @add_nsw_signbit(<2 x i32> %x) { +; CHECK-LABEL: @add_nsw_signbit( +; CHECK-NEXT: ret <2 x i32> %x +; + %y = xor <2 x i32> %x, <i32 -2147483648, i32 -2147483648> + %z = add nsw <2 x i32> %y, <i32 -2147483648, i32 -2147483648> + ret <2 x i32> %z +} + +; add nuw (xor X, signbit), signbit --> X + +define <2 x i5> @add_nuw_signbit(<2 x i5> %x) { +; CHECK-LABEL: @add_nuw_signbit( +; CHECK-NEXT: ret <2 x i5> %x +; + %y = xor <2 x i5> %x, <i5 -16, i5 -16> + %z = add nuw <2 x i5> %y, <i5 -16, i5 -16> + ret <2 x i5> %z +} + define i64 @pow2(i32 %x) { ; CHECK-LABEL: @pow2( ; CHECK-NEXT: [[NEGX:%.*]] = sub i32 0, %x diff --git a/test/Transforms/InstSimplify/addsub.ll 
b/test/Transforms/InstSimplify/addsub.ll new file mode 100644 index 000000000000..2f19a4d205e7 --- /dev/null +++ b/test/Transforms/InstSimplify/addsub.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +define i1 @test1(i1 %a) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret i1 true +; + %b = xor i1 %a, true + %res = sub i1 %a, %b + ret i1 %res +} + +define <2 x i1> @test2(<2 x i1> %a) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true> +; + %b = xor <2 x i1> %a, <i1 true, i1 true> + %res = sub <2 x i1> %a, %b + ret <2 x i1> %res +} + +define i1 @test5(i1 %a) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: ret i1 false +; + %res = add i1 %a, %a + ret i1 %res +} + +define <2 x i1> @test6(<2 x i1> %a) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: ret <2 x i1> zeroinitializer +; + %res = add <2 x i1> %a, %a + ret <2 x i1> %res +} + +define i1 @test7(i1 %a) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: ret i1 [[A:%.*]] +; + %c = xor i1 %a, true + %res = add i1 %c, true + ret i1 %res +} + +; TODO: simplify this to %a +define i1 @test8(i1 %a) { +; CHECK-LABEL: @test8( +; CHECK-NEXT: [[C:%.*]] = add i1 [[A:%.*]], true +; CHECK-NEXT: [[RES:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: ret i1 [[RES]] +; + %c = add i1 %a, true + %res = xor i1 %c, true + ret i1 %res +} + +define i1 @test9(i1 %a) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: ret i1 [[A:%.*]] +; + %c = xor i1 %a, true + %res = sub i1 %c, true + ret i1 %res +} + +; TODO: simplify this to %a +define i1 @test10(i1 %a) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: [[C:%.*]] = sub i1 [[A:%.*]], true +; CHECK-NEXT: [[RES:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: ret i1 [[RES]] +; + %c = sub i1 %a, true + %res = xor i1 %c, true + ret i1 %res +} diff --git a/test/Transforms/InstSimplify/assume.ll b/test/Transforms/InstSimplify/assume.ll index 2487a9c8bb15..66f2120f2928 100644 --- a/test/Transforms/InstSimplify/assume.ll +++ b/test/Transforms/InstSimplify/assume.ll @@ -1,5 +1,10 @@ ; NOTE: Assertions have been autogenerated by update_test_checks.py -; RUN: opt -instsimplify -S < %s | FileCheck %s +; RUN: opt -instsimplify -S < %s 2>&1 -pass-remarks-analysis=.* | FileCheck %s + +; Verify that warnings are emitted for the 2nd and 3rd tests. + +; CHECK: remark: /tmp/s.c:1:13: Detected conflicting code assumptions. +; CHECK: remark: /tmp/s.c:4:10: Detected conflicting code assumptions. define void @test1() { ; CHECK-LABEL: @test1( @@ -10,5 +15,58 @@ define void @test1() { } +; The alloca guarantees that the low bits of %a are zero because of alignment. +; The assume says the opposite. The assume is processed last, so that's the +; return value. There's no way to win (we can't undo transforms that happened +; based on half-truths), so just don't crash. + +define i64 @PR31809() !dbg !7 { +; CHECK-LABEL: @PR31809( +; CHECK-NEXT: ret i64 3 +; + %a = alloca i32 + %t1 = ptrtoint i32* %a to i64, !dbg !9 + %cond = icmp eq i64 %t1, 3 + call void @llvm.assume(i1 %cond) + ret i64 %t1 +} + +; Similar to above: there's no way to know which assumption is truthful, +; so just don't crash. The second icmp+assume gets processed later, so that +; determines the return value. 
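For context, a call to @llvm.assume tells the optimizer that its i1 operand is true at that program point, so later uses of the assumed value may be simplified accordingly; when two assumptions contradict each other, executing the code is already undefined, so any fold is acceptable. A minimal sketch of the well-defined single-assumption case (illustrative name):

declare void @llvm.assume(i1)

define i8 @one_assumption(i8 %x) {
  %cond = icmp eq i8 %x, 3
  call void @llvm.assume(i1 %cond)
  %add = add i8 %x, 1   ; may fold to the constant 4, since %x is assumed equal to 3
  ret i8 %add
}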
+ +define i8 @conflicting_assumptions(i8 %x) !dbg !10 { +; CHECK-LABEL: @conflicting_assumptions( +; CHECK-NEXT: call void @llvm.assume(i1 false) +; CHECK-NEXT: [[COND2:%.*]] = icmp eq i8 %x, 4 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND2]]) +; CHECK-NEXT: ret i8 5 +; + %add = add i8 %x, 1, !dbg !11 + %cond1 = icmp eq i8 %x, 3 + call void @llvm.assume(i1 %cond1) + %cond2 = icmp eq i8 %x, 4 + call void @llvm.assume(i1 %cond2) + ret i8 %add +} + declare void @llvm.assume(i1) nounwind +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 282540) (llvm/trunk 282542)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2) +!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"PIC Level", i32 2} +!6 = !{!"clang version 4.0.0 (trunk 282540) (llvm/trunk 282542)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 1, column: 13, scope: !7) +!10 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, variables: !2) +!11 = !DILocation(line: 4, column: 10, scope: !10) +!12 = !DILocation(line: 4, column: 3, scope: !10) + diff --git a/test/Transforms/InstSimplify/bitreverse.ll b/test/Transforms/InstSimplify/bitreverse.ll new file mode 100644 index 000000000000..d87b68831fe5 --- /dev/null +++ b/test/Transforms/InstSimplify/bitreverse.ll @@ -0,0 +1,31 @@ +; RUN: opt < %s -S -instsimplify | FileCheck %s + +declare i32 @llvm.bitreverse.i32(i32) + +; CHECK-LABEL: @test1( +; CHECK: ret i1 false +define i1 @test1(i32 %arg) { + %a = or i32 %arg, 1 + %b = call i32 @llvm.bitreverse.i32(i32 %a) + %res = icmp eq i32 %b, 0 + ret i1 %res +} + +; CHECK-LABEL: @test2( +; CHECK: ret i1 false +define i1 @test2(i32 %arg) { + %a = or i32 %arg, 1024 + %b = call i32 @llvm.bitreverse.i32(i32 %a) + %res = icmp eq i32 %b, 0 + ret i1 %res +} + +; CHECK-LABEL: @test3( +; CHECK: ret i1 false +define i1 @test3(i32 %arg) { + %a = and i32 %arg, 1 + %b = call i32 @llvm.bitreverse.i32(i32 %a) + %and = and i32 %b, 1 + %res = icmp eq i32 %and, 1 + ret i1 %res +} diff --git a/test/Transforms/InstSimplify/div.ll b/test/Transforms/InstSimplify/div.ll index b8ce34aaa37e..f096719359dc 100644 --- a/test/Transforms/InstSimplify/div.ll +++ b/test/Transforms/InstSimplify/div.ll @@ -1,10 +1,64 @@ ; RUN: opt < %s -instsimplify -S | FileCheck %s +; Division-by-zero is undef. UB in any vector lane means the whole op is undef. 
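+; For example (an illustrative sketch of the first test below): folding +; %div = sdiv <2 x i8> <i8 1, i8 2>, <i8 0, i8 -42> +; hits 1 sdiv 0 in lane 0, which is immediate UB, so the whole vector result +; becomes undef even though lane 1 (2 sdiv -42) is well defined.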
+ +define <2 x i8> @sdiv_zero_elt_vec_constfold(<2 x i8> %x) { +; CHECK-LABEL: @sdiv_zero_elt_vec_constfold( +; CHECK-NEXT: ret <2 x i8> undef +; + %div = sdiv <2 x i8> <i8 1, i8 2>, <i8 0, i8 -42> + ret <2 x i8> %div +} + +define <2 x i8> @udiv_zero_elt_vec_constfold(<2 x i8> %x) { +; CHECK-LABEL: @udiv_zero_elt_vec_constfold( +; CHECK-NEXT: ret <2 x i8> undef +; + %div = udiv <2 x i8> <i8 1, i8 2>, <i8 42, i8 0> + ret <2 x i8> %div +} + +define <2 x i8> @sdiv_zero_elt_vec(<2 x i8> %x) { +; CHECK-LABEL: @sdiv_zero_elt_vec( +; CHECK-NEXT: ret <2 x i8> undef +; + %div = sdiv <2 x i8> %x, <i8 -42, i8 0> + ret <2 x i8> %div +} + +define <2 x i8> @udiv_zero_elt_vec(<2 x i8> %x) { +; CHECK-LABEL: @udiv_zero_elt_vec( +; CHECK-NEXT: ret <2 x i8> undef +; + %div = udiv <2 x i8> %x, <i8 0, i8 42> + ret <2 x i8> %div +} + +; Division-by-zero is undef. UB in any vector lane means the whole op is undef. +; Thus, we can simplify this: if any element of 'y' is 0, we can do anything. +; Therefore, assume that all elements of 'y' must be 1. + +define <2 x i1> @sdiv_bool_vec(<2 x i1> %x, <2 x i1> %y) { +; CHECK-LABEL: @sdiv_bool_vec( +; CHECK-NEXT: ret <2 x i1> %x +; + %div = sdiv <2 x i1> %x, %y + ret <2 x i1> %div +} + +define <2 x i1> @udiv_bool_vec(<2 x i1> %x, <2 x i1> %y) { +; CHECK-LABEL: @udiv_bool_vec( +; CHECK-NEXT: ret <2 x i1> %x +; + %div = udiv <2 x i1> %x, %y + ret <2 x i1> %div +} + declare i32 @external() define i32 @div1() { ; CHECK-LABEL: @div1( -; CHECK: [[CALL:%.*]] = call i32 @external(), !range !0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @external(), !range !0 ; CHECK-NEXT: ret i32 0 ; %call = call i32 @external(), !range !0 diff --git a/test/Transforms/InstSimplify/fdiv.ll b/test/Transforms/InstSimplify/fdiv.ll index bb7f443f4238..6643afd81471 100644 --- a/test/Transforms/InstSimplify/fdiv.ll +++ b/test/Transforms/InstSimplify/fdiv.ll @@ -1,9 +1,25 @@ -; NOTE: Assertions have been autogenerated by update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instsimplify -S | FileCheck %s +define float @fdiv_constant_fold() { +; CHECK-LABEL: @fdiv_constant_fold( +; CHECK-NEXT: ret float 1.500000e+00 +; + %f = fdiv float 3.0, 2.0 + ret float %f +} + +define float @frem_constant_fold() { +; CHECK-LABEL: @frem_constant_fold( +; CHECK-NEXT: ret float 1.000000e+00 +; + %f = frem float 3.0, 2.0 + ret float %f +} + define double @fdiv_of_undef(double %X) { ; CHECK-LABEL: @fdiv_of_undef( -; CHECK: ret double undef +; CHECK-NEXT: ret double undef ; ; undef / X -> undef %r = fdiv double undef, %X @@ -12,7 +28,7 @@ define double @fdiv_of_undef(double %X) { define double @fdiv_by_undef(double %X) { ; CHECK-LABEL: @fdiv_by_undef( -; CHECK: ret double undef +; CHECK-NEXT: ret double undef ; ; X / undef -> undef %r = fdiv double %X, undef diff --git a/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/test/Transforms/InstSimplify/floating-point-arithmetic.ll index dfdb88dcc858..e635032e6b71 100644 --- a/test/Transforms/InstSimplify/floating-point-arithmetic.ll +++ b/test/Transforms/InstSimplify/floating-point-arithmetic.ll @@ -104,6 +104,7 @@ define float @PR22688(float %x) { } declare float @llvm.fabs.f32(float) +declare float @llvm.sqrt.f32(float) ; CHECK-LABEL: @fabs_select_positive_constants( ; CHECK: %select = select i1 %cmp, float 1.000000e+00, float 2.000000e+00 @@ -195,3 +196,56 @@ define float @fabs_select_negnan_zero(float addrspace(1)* %out, i32 %c) { %fabs = call float @llvm.fabs.f32(float %select) ret float %fabs } + +; 
CHECK-LABEL: @fabs_sqrt +; CHECK: call float @llvm.sqrt.f32 +; CHECK: call float @llvm.fabs.f32 +define float @fabs_sqrt(float %a) { +; The fabs can't be eliminated because llvm.sqrt.f32 may return -0 or NaN with +; an arbitrary sign bit. + %sqrt = call float @llvm.sqrt.f32(float %a) + %fabs = call float @llvm.fabs.f32(float %sqrt) + ret float %fabs +} + +; CHECK-LABEL: @fabs_sqrt_nnan +; CHECK: call nnan float @llvm.sqrt.f32 +; CHECK: call float @llvm.fabs.f32 +define float @fabs_sqrt_nnan(float %a) { +; The fabs can't be eliminated because the nnan sqrt may still return -0. + %sqrt = call nnan float @llvm.sqrt.f32(float %a) + %fabs = call float @llvm.fabs.f32(float %sqrt) + ret float %fabs +} + +; CHECK-LABEL: @fabs_sqrt_nsz +; CHECK: call nsz float @llvm.sqrt.f32 +; CHECK: call float @llvm.fabs.f32 +define float @fabs_sqrt_nsz(float %a) { +; The fabs can't be eliminated because the nsz sqrt may still return NaN. + %sqrt = call nsz float @llvm.sqrt.f32(float %a) + %fabs = call float @llvm.fabs.f32(float %sqrt) + ret float %fabs +} + +; CHECK-LABEL: @fabs_sqrt_nnan_nsz +; CHECK: call nnan nsz float @llvm.sqrt.f32 +; CHECK-NOT: call float @llvm.fabs.f32 +define float @fabs_sqrt_nnan_nsz(float %a) { +; The fabs can be eliminated because we're nsz and nnan. + %sqrt = call nnan nsz float @llvm.sqrt.f32(float %a) + %fabs = call float @llvm.fabs.f32(float %sqrt) + ret float %fabs +} + +; CHECK-LABEL: @fabs_sqrt_nnan_fabs +; CHECK: call float @llvm.fabs.f32 +; CHECK: call nnan float @llvm.sqrt.f32 +; CHECK-NOT: call float @llvm.fabs.f32 +define float @fabs_sqrt_nnan_fabs(float %a) { +; The second fabs can be eliminated because the operand to sqrt cannot be -0. + %b = call float @llvm.fabs.f32(float %a) + %sqrt = call nnan float @llvm.sqrt.f32(float %b) + %fabs = call float @llvm.fabs.f32(float %sqrt) + ret float %fabs +} diff --git a/test/Transforms/InstSimplify/icmp-constant.ll b/test/Transforms/InstSimplify/icmp-constant.ll index 85de1a45ea27..918722299b59 100644 --- a/test/Transforms/InstSimplify/icmp-constant.ll +++ b/test/Transforms/InstSimplify/icmp-constant.ll @@ -416,3 +416,158 @@ define <2 x i1> @tautological9_vec(<2 x i32> %x) { ret <2 x i1> %cmp } +; The upper bound of the 'add' is 0. + +define i1 @add_nsw_neg_const1(i32 %x) { +; CHECK-LABEL: @add_nsw_neg_const1( +; CHECK-NEXT: ret i1 false +; + %add = add nsw i32 %x, -2147483647 + %cmp = icmp sgt i32 %add, 0 + ret i1 %cmp +} + +; InstCombine can fold this, but not InstSimplify. + +define i1 @add_nsw_neg_const2(i32 %x) { +; CHECK-LABEL: @add_nsw_neg_const2( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 %x, -2147483647 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[ADD]], -1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, -2147483647 + %cmp = icmp sgt i32 %add, -1 + ret i1 %cmp +} + +; The upper bound of the 'add' is 1 (move the constants to prove we're doing range-based analysis). + +define i1 @add_nsw_neg_const3(i32 %x) { +; CHECK-LABEL: @add_nsw_neg_const3( +; CHECK-NEXT: ret i1 false +; + %add = add nsw i32 %x, -2147483646 + %cmp = icmp sgt i32 %add, 1 + ret i1 %cmp +} + +; InstCombine can fold this, but not InstSimplify. 
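+; (Informal range sketch: 'add nsw i32 %x, -2147483646' is confined to +; [-2147483648, 1], so the 'sgt 1' compare above is always false, while the +; 'sgt 0' compare below is still satisfiable at exactly 1.)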
+ +define i1 @add_nsw_neg_const4(i32 %x) { +; CHECK-LABEL: @add_nsw_neg_const4( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 %x, -2147483646 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[ADD]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, -2147483646 + %cmp = icmp sgt i32 %add, 0 + ret i1 %cmp +} + +; The upper bound of the 'add' is 2147483647 - 42 = 2147483605 (move the constants again and try a different cmp predicate). + +define i1 @add_nsw_neg_const5(i32 %x) { +; CHECK-LABEL: @add_nsw_neg_const5( +; CHECK-NEXT: ret i1 true +; + %add = add nsw i32 %x, -42 + %cmp = icmp ne i32 %add, 2147483606 + ret i1 %cmp +} + +; InstCombine can fold this, but not InstSimplify. + +define i1 @add_nsw_neg_const6(i32 %x) { +; CHECK-LABEL: @add_nsw_neg_const6( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 %x, -42 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[ADD]], 2147483605 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, -42 + %cmp = icmp ne i32 %add, 2147483605 + ret i1 %cmp +} + +; The lower bound of the 'add' is -1. + +define i1 @add_nsw_pos_const1(i32 %x) { +; CHECK-LABEL: @add_nsw_pos_const1( +; CHECK-NEXT: ret i1 false +; + %add = add nsw i32 %x, 2147483647 + %cmp = icmp slt i32 %add, -1 + ret i1 %cmp +} + +; InstCombine can fold this, but not InstSimplify. + +define i1 @add_nsw_pos_const2(i32 %x) { +; CHECK-LABEL: @add_nsw_pos_const2( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 %x, 2147483647 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, 2147483647 + %cmp = icmp slt i32 %add, 0 + ret i1 %cmp +} + +; The lower bound of the 'add' is -2 (move the constants to prove we're doing range-based analysis). + +define i1 @add_nsw_pos_const3(i32 %x) { +; CHECK-LABEL: @add_nsw_pos_const3( +; CHECK-NEXT: ret i1 false +; + %add = add nsw i32 %x, 2147483646 + %cmp = icmp slt i32 %add, -2 + ret i1 %cmp +} + +; InstCombine can fold this, but not InstSimplify. + +define i1 @add_nsw_pos_const4(i32 %x) { +; CHECK-LABEL: @add_nsw_pos_const4( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 %x, 2147483646 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD]], -1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, 2147483646 + %cmp = icmp slt i32 %add, -1 + ret i1 %cmp +} + +; The lower bound of the 'add' is -2147483648 + 42 = -2147483606 (move the constants again and change the cmp predicate). + +define i1 @add_nsw_pos_const5(i32 %x) { +; CHECK-LABEL: @add_nsw_pos_const5( +; CHECK-NEXT: ret i1 false +; + %add = add nsw i32 %x, 42 + %cmp = icmp eq i32 %add, -2147483607 + ret i1 %cmp +} + +; InstCombine can fold this, but not InstSimplify. + +define i1 @add_nsw_pos_const6(i32 %x) { +; CHECK-LABEL: @add_nsw_pos_const6( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 %x, 42 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ADD]], -2147483606 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, 42 + %cmp = icmp eq i32 %add, -2147483606 + ret i1 %cmp +} + +; Verify that vectors work too. 
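+; (Per-lane sketch: 'add nsw <2 x i32> %x, <i32 42, i32 42>' is confined to +; [-2147483606, 2147483647] in each lane, so it can never equal -2147483607 +; and the 'ne' compare folds to <i1 true, i1 true>.)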
+ +define <2 x i1> @add_nsw_pos_const5_splat_vec(<2 x i32> %x) { +; CHECK-LABEL: @add_nsw_pos_const5_splat_vec( +; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true> +; + %add = add nsw <2 x i32> %x, <i32 42, i32 42> + %cmp = icmp ne <2 x i32> %add, <i32 -2147483607, i32 -2147483607> + ret <2 x i1> %cmp +} + diff --git a/test/Transforms/InstSimplify/mul.ll b/test/Transforms/InstSimplify/mul.ll new file mode 100644 index 000000000000..0bf8f699a686 --- /dev/null +++ b/test/Transforms/InstSimplify/mul.ll @@ -0,0 +1,11 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +define <2 x i1> @test1(<2 x i1> %a) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret <2 x i1> zeroinitializer +; + %b = and <2 x i1> %a, <i1 true, i1 false> + %res = mul <2 x i1> %b, <i1 false, i1 true> + ret <2 x i1> %res +} diff --git a/test/Transforms/InstSimplify/rem.ll b/test/Transforms/InstSimplify/rem.ll index c73d34346ded..b7f18f36b4b9 100644 --- a/test/Transforms/InstSimplify/rem.ll +++ b/test/Transforms/InstSimplify/rem.ll @@ -1,9 +1,63 @@ ; NOTE: Assertions have been autogenerated by update_test_checks.py ; RUN: opt < %s -instsimplify -S | FileCheck %s +; Division-by-zero is undef. UB in any vector lane means the whole op is undef. + +define <2 x i8> @srem_zero_elt_vec_constfold(<2 x i8> %x) { +; CHECK-LABEL: @srem_zero_elt_vec_constfold( +; CHECK-NEXT: ret <2 x i8> undef +; + %rem = srem <2 x i8> <i8 1, i8 2>, <i8 0, i8 -42> + ret <2 x i8> %rem +} + +define <2 x i8> @urem_zero_elt_vec_constfold(<2 x i8> %x) { +; CHECK-LABEL: @urem_zero_elt_vec_constfold( +; CHECK-NEXT: ret <2 x i8> undef +; + %rem = urem <2 x i8> <i8 1, i8 2>, <i8 42, i8 0> + ret <2 x i8> %rem +} + +define <2 x i8> @srem_zero_elt_vec(<2 x i8> %x) { +; CHECK-LABEL: @srem_zero_elt_vec( +; CHECK-NEXT: ret <2 x i8> undef +; + %rem = srem <2 x i8> %x, <i8 -42, i8 0> + ret <2 x i8> %rem +} + +define <2 x i8> @urem_zero_elt_vec(<2 x i8> %x) { +; CHECK-LABEL: @urem_zero_elt_vec( +; CHECK-NEXT: ret <2 x i8> undef +; + %rem = urem <2 x i8> %x, <i8 0, i8 42> + ret <2 x i8> %rem +} + +; Division-by-zero is undef. UB in any vector lane means the whole op is undef. +; Thus, we can simplify this: if any element of 'y' is 0, we can do anything. +; Therefore, assume that all elements of 'y' must be 1. 
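+; (For i1 vectors that means every lane of 'y' is true; any value leaves a +; remainder of 0 when divided by 1, or by -1 in the signed reading, so both +; tests below fold to zeroinitializer.)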
+ +define <2 x i1> @srem_bool_vec(<2 x i1> %x, <2 x i1> %y) { +; CHECK-LABEL: @srem_bool_vec( +; CHECK-NEXT: ret <2 x i1> zeroinitializer +; + %rem = srem <2 x i1> %x, %y + ret <2 x i1> %rem +} + +define <2 x i1> @urem_bool_vec(<2 x i1> %x, <2 x i1> %y) { +; CHECK-LABEL: @urem_bool_vec( +; CHECK-NEXT: ret <2 x i1> zeroinitializer +; + %rem = urem <2 x i1> %x, %y + ret <2 x i1> %rem +} + define i32 @select1(i32 %x, i1 %b) { ; CHECK-LABEL: @select1( -; CHECK: ret i32 0 +; CHECK-NEXT: ret i32 0 ; %rhs = select i1 %b, i32 %x, i32 1 %rem = srem i32 %x, %rhs @@ -12,7 +66,7 @@ define i32 @select1(i32 %x, i1 %b) { define i32 @select2(i32 %x, i1 %b) { ; CHECK-LABEL: @select2( -; CHECK: ret i32 0 +; CHECK-NEXT: ret i32 0 ; %rhs = select i1 %b, i32 %x, i32 1 %rem = urem i32 %x, %rhs @@ -21,40 +75,40 @@ define i32 @select2(i32 %x, i1 %b) { define i32 @rem1(i32 %x, i32 %n) { ; CHECK-LABEL: @rem1( -; CHECK: [[MOD:%.*]] = srem i32 %x, %n +; CHECK-NEXT: [[MOD:%.*]] = srem i32 %x, %n ; CHECK-NEXT: ret i32 [[MOD]] ; - %mod = srem i32 %x, %n - %mod1 = srem i32 %mod, %n - ret i32 %mod1 + %mod = srem i32 %x, %n + %mod1 = srem i32 %mod, %n + ret i32 %mod1 } define i32 @rem2(i32 %x, i32 %n) { ; CHECK-LABEL: @rem2( -; CHECK: [[MOD:%.*]] = urem i32 %x, %n +; CHECK-NEXT: [[MOD:%.*]] = urem i32 %x, %n ; CHECK-NEXT: ret i32 [[MOD]] ; - %mod = urem i32 %x, %n - %mod1 = urem i32 %mod, %n - ret i32 %mod1 + %mod = urem i32 %x, %n + %mod1 = urem i32 %mod, %n + ret i32 %mod1 } define i32 @rem3(i32 %x, i32 %n) { ; CHECK-LABEL: @rem3( -; CHECK: [[MOD:%.*]] = srem i32 %x, %n +; CHECK-NEXT: [[MOD:%.*]] = srem i32 %x, %n ; CHECK-NEXT: [[MOD1:%.*]] = urem i32 [[MOD]], %n ; CHECK-NEXT: ret i32 [[MOD1]] ; - %mod = srem i32 %x, %n - %mod1 = urem i32 %mod, %n - ret i32 %mod1 + %mod = srem i32 %x, %n + %mod1 = urem i32 %mod, %n + ret i32 %mod1 } declare i32 @external() define i32 @rem4() { ; CHECK-LABEL: @rem4( -; CHECK: [[CALL:%.*]] = call i32 @external(), !range !0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @external(), !range !0 ; CHECK-NEXT: ret i32 [[CALL]] ; %call = call i32 @external(), !range !0 diff --git a/test/Transforms/InstSimplify/select.ll b/test/Transforms/InstSimplify/select.ll index 1acb5c469d37..cb2502cf63c9 100644 --- a/test/Transforms/InstSimplify/select.ll +++ b/test/Transforms/InstSimplify/select.ll @@ -402,7 +402,8 @@ define i32* @select_icmp_pointers(i32* %x, i32* %y) { ret i32* %sel } -; FIXME: If the condition is known, we don't need to select. +; If the condition is known, we don't need to select, but we're not +; doing this fold here to avoid compile-time cost. declare void @llvm.assume(i1) diff --git a/test/Transforms/InstSimplify/shift-knownbits.ll b/test/Transforms/InstSimplify/shift-knownbits.ll index f50ea0582c6c..63b9b76fd22f 100644 --- a/test/Transforms/InstSimplify/shift-knownbits.ll +++ b/test/Transforms/InstSimplify/shift-knownbits.ll @@ -145,3 +145,46 @@ define i1 @shl_i1(i1 %a, i1 %b) { ret i1 %shl } +; Simplify count leading/trailing zeros to zero if all valid bits are shifted out. 
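+; (Sketch: with the 'zero is undef' flag set, ctlz/cttz on i32 is known to +; lie in [0, 31], which fits in 5 bits, so 'lshr %ct, 5' is always 0; the +; <2 x i8> results lie in [0, 7], so 'lshr %ct, <i8 3, i8 3>' is likewise 0.)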
+ +declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone +declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone +declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1) nounwind readnone +declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone + +define i32 @lshr_ctlz_zero_is_undef(i32 %x) { +; CHECK-LABEL: @lshr_ctlz_zero_is_undef( +; CHECK-NEXT: ret i32 0 +; + %ct = call i32 @llvm.ctlz.i32(i32 %x, i1 true) + %sh = lshr i32 %ct, 5 + ret i32 %sh +} + +define i32 @lshr_cttz_zero_is_undef(i32 %x) { +; CHECK-LABEL: @lshr_cttz_zero_is_undef( +; CHECK-NEXT: ret i32 0 +; + %ct = call i32 @llvm.cttz.i32(i32 %x, i1 true) + %sh = lshr i32 %ct, 5 + ret i32 %sh +} + +define <2 x i8> @lshr_ctlz_zero_is_undef_splat_vec(<2 x i8> %x) { +; CHECK-LABEL: @lshr_ctlz_zero_is_undef_splat_vec( +; CHECK-NEXT: ret <2 x i8> zeroinitializer +; + %ct = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %x, i1 true) + %sh = lshr <2 x i8> %ct, <i8 3, i8 3> + ret <2 x i8> %sh +} + +define <2 x i8> @lshr_cttz_zero_is_undef_splat_vec(<2 x i8> %x) { +; CHECK-LABEL: @lshr_cttz_zero_is_undef_splat_vec( +; CHECK-NEXT: ret <2 x i8> zeroinitializer +; + %ct = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %x, i1 true) + %sh = lshr <2 x i8> %ct, <i8 3, i8 3> + ret <2 x i8> %sh +} + diff --git a/test/Transforms/InstSimplify/shufflevector.ll b/test/Transforms/InstSimplify/shufflevector.ll new file mode 100644 index 000000000000..c6d180da293f --- /dev/null +++ b/test/Transforms/InstSimplify/shufflevector.ll @@ -0,0 +1,212 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +define <4 x i32> @const_folding(<4 x i32> %x) { +; CHECK-LABEL: @const_folding( +; CHECK-NEXT: ret <4 x i32> zeroinitializer +; + %shuf = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer, <4 x i32> <i32 5, i32 4, i32 5, i32 4> + ret <4 x i32> %shuf +} + +define <4 x i32> @const_folding1(<4 x i32> %x) { +; CHECK-LABEL: @const_folding1( +; CHECK-NEXT: ret <4 x i32> <i32 5, i32 5, i32 5, i32 5> +; + %shuf = shufflevector <4 x i32> <i32 5, i32 4, i32 5, i32 4>, <4 x i32> %x, <4 x i32> zeroinitializer + ret <4 x i32> %shuf +} + +define <4 x i32> @const_folding_negative(<3 x i32> %x) { +; CHECK-LABEL: @const_folding_negative( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x i32> [[X:%.*]], <3 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 5, i32 4> +; CHECK-NEXT: ret <4 x i32> [[SHUF]] +; + %shuf = shufflevector <3 x i32> %x, <3 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 5, i32 4> + ret <4 x i32> %shuf +} + +define <4 x i32> @splat_operand(<4 x i32> %x) { +; CHECK-LABEL: @splat_operand( +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i32> [[SPLAT]] +; + %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer + %shuf = shufflevector <4 x i32> %splat, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1> + ret <4 x i32> %shuf +} + +define <4 x i32> @splat_operand1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @splat_operand1( +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i32> [[SPLAT]] +; + %splat = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> zeroinitializer + %shuf = shufflevector <4 x i32> %splat, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1> + ret <4 x i32> %shuf +} + +define <4 x i32> @splat_operand2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @splat_operand2( +; 
CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i32> [[SPLAT]] +; + %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer + %shuf = shufflevector <4 x i32> %splat, <4 x i32> %y, <4 x i32> <i32 0, i32 3, i32 2, i32 1> + ret <4 x i32> %shuf +} + +define <4 x i32> @splat_operand3(<4 x i32> %x) { +; CHECK-LABEL: @splat_operand3( +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i32> [[SPLAT]] +; + %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer + %shuf = shufflevector <4 x i32> zeroinitializer, <4 x i32> %splat, <4 x i32> <i32 7, i32 6, i32 5, i32 5> + ret <4 x i32> %shuf +} + +define <8 x i32> @splat_operand_negative(<4 x i32> %x) { +; CHECK-LABEL: @splat_operand_negative( +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[SPLAT]], <4 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: ret <8 x i32> [[SHUF]] +; + %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer + %shuf = shufflevector <4 x i32> %splat, <4 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i32> %shuf +} + +define <4 x i32> @splat_operand_negative2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @splat_operand_negative2( +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[SPLAT]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 4, i32 1> +; CHECK-NEXT: ret <4 x i32> [[SHUF]] +; + %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer + %shuf = shufflevector <4 x i32> %splat, <4 x i32> %y, <4 x i32> <i32 0, i32 3, i32 4, i32 1> + ret <4 x i32> %shuf +} + +define <4 x i32> @splat_operand_negative3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @splat_operand_negative3( +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[SPLAT]], <4 x i32> <i32 0, i32 3, i32 4, i32 1> +; CHECK-NEXT: ret <4 x i32> [[SHUF]] +; + %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer + %shuf = shufflevector <4 x i32> %y, <4 x i32> %splat, <4 x i32> <i32 0, i32 3, i32 4, i32 1> + ret <4 x i32> %shuf +} + +define <4 x i32> @splat_operand_negative4(<4 x i32> %x) { +; CHECK-LABEL: @splat_operand_negative4( +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[SPLAT]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> +; CHECK-NEXT: ret <4 x i32> [[SHUF]] +; + %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef> + %shuf = shufflevector <4 x i32> %splat, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> + ret <4 x i32> %shuf +} + +define <4 x i32> @undef_mask(<4 x i32> %x) { +; CHECK-LABEL: @undef_mask( +; CHECK-NEXT: ret <4 x i32> undef +; + %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> undef + ret <4 x i32> %shuf +} + +define <4 x i32> @identity_mask_0(<4 x i32> %x) { 
+; CHECK-LABEL: @identity_mask_0( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: ret <4 x i32> [[SHUF]] +; + %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %shuf +} + +define <4 x i32> @identity_mask_1(<4 x i32> %x) { +; CHECK-LABEL: @identity_mask_1( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> undef, <4 x i32> [[X:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: ret <4 x i32> [[SHUF]] +; + %shuf = shufflevector <4 x i32> undef, <4 x i32> %x, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + ret <4 x i32> %shuf +} + +define <4 x i32> @pseudo_identity_mask(<4 x i32> %x) { +; CHECK-LABEL: @pseudo_identity_mask( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 2, i32 7> +; CHECK-NEXT: ret <4 x i32> [[SHUF]] +; + %shuf = shufflevector <4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x i32> %shuf +} + +define <4 x i32> @const_operand(<4 x i32> %x) { +; CHECK-LABEL: @const_operand( +; CHECK-NEXT: ret <4 x i32> <i32 42, i32 45, i32 44, i32 43> +; + %shuf = shufflevector <4 x i32> <i32 42, i32 43, i32 44, i32 45>, <4 x i32> %x, <4 x i32> <i32 0, i32 3, i32 2, i32 1> + ret <4 x i32> %shuf +} + +define <4 x i32> @merge(<4 x i32> %x) { +; CHECK-LABEL: @merge( +; CHECK-NEXT: [[LOWER:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <2 x i32> <i32 1, i32 0> +; CHECK-NEXT: [[UPPER:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> undef, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: [[MERGED:%.*]] = shufflevector <2 x i32> [[UPPER]], <2 x i32> [[LOWER]], <4 x i32> <i32 3, i32 2, i32 0, i32 1> +; CHECK-NEXT: ret <4 x i32> [[MERGED]] +; + %lower = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 1, i32 0> + %upper = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %merged = shufflevector <2 x i32> %upper, <2 x i32> %lower, <4 x i32> <i32 3, i32 2, i32 0, i32 1> + ret <4 x i32> %merged +} + +define <8 x double> @extract_and_concat(<8 x double> %x) { +; CHECK-LABEL: @extract_and_concat( +; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x double> [[X:%.*]], <8 x double> undef, <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x double> [[X]], <8 x double> undef, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x double> [[X]], <8 x double> undef, <2 x i32> <i32 4, i32 5> +; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x double> [[X]], <8 x double> undef, <2 x i32> <i32 6, i32 7> +; CHECK-NEXT: [[S5:%.*]] = shufflevector <2 x double> [[S1]], <2 x double> [[S2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[S6:%.*]] = shufflevector <2 x double> [[S3]], <2 x double> [[S4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[S7:%.*]] = shufflevector <4 x double> [[S5]], <4 x double> [[S6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: ret <8 x double> [[S7]] +; + %s1 = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 0, i32 1> + %s2 = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 2, i32 3> + %s3 = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 4, i32 5> + %s4 = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 6, i32 7> + %s5 = shufflevector <2 x double> %s1, <2 x double> %s2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %s6 = shufflevector <2 x double> %s3, <2 x double> %s4, <4 
x i32> <i32 0, i32 1, i32 2, i32 3> + %s7 = shufflevector <4 x double> %s5, <4 x double> %s6, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %s7 +} + +; This case has intermediate lane crossings. + +define <8 x i64> @PR30630(<8 x i64> %x) { +; CHECK-LABEL: @PR30630( +; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x i64> [[X:%.*]], <8 x i64> undef, <2 x i32> <i32 0, i32 4> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x i64> [[X]], <8 x i64> undef, <2 x i32> <i32 1, i32 5> +; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x i64> [[X]], <8 x i64> undef, <2 x i32> <i32 2, i32 6> +; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x i64> [[X]], <8 x i64> undef, <2 x i32> <i32 3, i32 7> +; CHECK-NEXT: [[S5:%.*]] = shufflevector <2 x i64> [[S1]], <2 x i64> [[S2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[S6:%.*]] = shufflevector <2 x i64> [[S3]], <2 x i64> [[S4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[S7:%.*]] = shufflevector <4 x i64> [[S5]], <4 x i64> [[S6]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> +; CHECK-NEXT: ret <8 x i64> [[S7]] +; + %s1 = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 0, i32 4> + %s2 = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 1, i32 5> + %s3 = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 2, i32 6> + %s4 = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 3, i32 7> + %s5 = shufflevector <2 x i64> %s1, <2 x i64> %s2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %s6 = shufflevector <2 x i64> %s3, <2 x i64> %s4, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %s7 = shufflevector <4 x i64> %s5, <4 x i64> %s6, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> + ret <8 x i64> %s7 +} + diff --git a/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll b/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll index 2a257d490815..a038fd1a411b 100644 --- a/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll +++ b/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll @@ -565,3 +565,212 @@ define void @no_interleave(<4 x float> %a0) { store <4 x float> %v0, <4 x float>* @g, align 16 ret void } + +define void @load_factor2_wide2(<16 x i32>* %ptr) { +; NEON-LABEL: @load_factor2_wide2( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]]) +; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1 +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0 +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor2_wide2( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + 
%interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4 + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret void +} + +define void @load_factor2_wide3(<24 x i32>* %ptr) { +; NEON-LABEL: @load_factor2_wide3( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]]) +; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1 +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0 +; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8 +; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; NEON-NEXT: [[LDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP10]]) +; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 1 +; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 0 +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> +; NEON-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> +; NEON-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor2_wide3( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4 + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23> + ret void +} + +define void @load_factor3_wide(<24 x i32>* %ptr) { +; NEON-LABEL: @load_factor3_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* 
[[TMP2]]) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 +; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP7]]) +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2 +; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1 +; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0 +; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor3_wide( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4 + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> + %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> + ret void +} + +define void @load_factor4_wide(<32 x i32>* %ptr) { +; NEON-LABEL: @load_factor4_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2 +; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP8]]) +; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 3 +; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2 +; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1 +; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0 +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> 
[[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor4_wide( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4 + %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> + %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29> + %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30> + %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31> + ret void +} + +define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) { +; NEON-LABEL: @store_factor2_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]]) +; NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15> +; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]]) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor2_wide( +; NO_NEON: ret void +; + %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4 + ret void +} + +define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) { +; NEON-LABEL: @store_factor3_wide( +; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19> +; NEON-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]]) +; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 
14, i32 15> +; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23> +; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 +; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]]) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor3_wide( +; NO_NEON: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> + store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4 + ret void +} + +define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) { +; NEON-LABEL: @store_factor4_wide( +; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19> +; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27> +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]]) +; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15> +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31> +; NEON-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; NEON-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]]) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor4_wide( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, 
i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31> + store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4 + ret void +} + +define void @load_factor2_fp128(<4 x fp128>* %ptr) { +; NEON-LABEL: @load_factor2_fp128( +; NEON-NOT: @llvm.aarch64.neon +; NEON: ret void +; NO_NEON-LABEL: @load_factor2_fp128( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16 + %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2> + %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3> + ret void +} diff --git a/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll b/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll index 21eb8d7a1b0a..5938f9d7321d 100644 --- a/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll +++ b/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s -check-prefix=NEON -; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefix=NO_NEON +; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s -check-prefixes=NEON,ALL +; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefixes=NO_NEON,ALL target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" target triple = "arm---eabi" @@ -387,13 +387,31 @@ define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2 ret void } +define void @load_f16_factor2(<8 x half>* %ptr) { +; ALL-LABEL: @load_f16_factor2( +; ALL-NOT: @llvm.arm.neon +; ALL: ret void +; + %interleaved.vec = load <8 x half>, <8 x half>* %ptr, align 4 + %v0 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %v1 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + ret void +} + +define void @store_f16_factor2(<8 x half>* %ptr, <4 x half> %v0, <4 x half> %v1) { +; ALL-LABEL: @store_f16_factor2( +; ALL-NOT: @llvm.arm.neon +; ALL: ret void +; + %interleaved.vec = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> + store <8 x half> %interleaved.vec, <8 x half>* %ptr, align 4 + ret void +} + define void @load_illegal_factor2(<3 x float>* %ptr) nounwind { -; NEON-LABEL: @load_illegal_factor2( -; NEON-NOT: @llvm.arm.neon -; NEON: ret void -; NO_NEON-LABEL: @load_illegal_factor2( -; NO_NEON-NOT: @llvm.arm.neon -; NO_NEON: ret void +; ALL-LABEL: @load_illegal_factor2( +; ALL-NOT: @llvm.arm.neon +; ALL: ret void ; %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16 %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef> @@ -401,12 +419,9 @@ define void @load_illegal_factor2(<3 x float>* %ptr) nounwind { } define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind { -; NEON-LABEL: @store_illegal_factor2( -; NEON-NOT: @llvm.arm.neon -; NEON: ret void -; NO_NEON-LABEL: @store_illegal_factor2( -; NO_NEON-NOT: @llvm.arm.neon -; NO_NEON: ret void +; ALL-LABEL: @store_illegal_factor2( +; ALL-NOT: @llvm.arm.neon +; ALL: ret void ; %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef> store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16 @@ -538,12 +553,9 @@ define void 
@store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i3 } define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { -; NEON-LABEL: @store_general_mask_factor3_undef_fail( -; NEON-NOT: @llvm.arm.neon -; NEON: ret void -; NO_NEON-LABEL: @store_general_mask_factor3_undef_fail( -; NO_NEON-NOT: @llvm.arm.neon -; NO_NEON: ret void +; ALL-LABEL: @store_general_mask_factor3_undef_fail( +; ALL-NOT: @llvm.arm.neon +; ALL: ret void ; %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19> store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4 @@ -568,12 +580,9 @@ define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> % } define void @store_general_mask_factor3_endstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { -; NEON-LABEL: @store_general_mask_factor3_endstart_fail( -; NEON-NOT: @llvm.arm.neon -; NEON: ret void -; NO_NEON-LABEL: @store_general_mask_factor3_endstart_fail( -; NO_NEON-NOT: @llvm.arm.neon -; NO_NEON: ret void +; ALL-LABEL: @store_general_mask_factor3_endstart_fail( +; ALL-NOT: @llvm.arm.neon +; ALL: ret void ; %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19> store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4 @@ -598,12 +607,9 @@ define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i3 } define void @store_general_mask_factor3_midstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) { -; NEON-LABEL: @store_general_mask_factor3_midstart_fail( -; NEON-NOT: @llvm.arm.neon -; NEON: ret void -; NO_NEON-LABEL: @store_general_mask_factor3_midstart_fail( -; NO_NEON-NOT: @llvm.arm.neon -; NO_NEON: ret void +; ALL-LABEL: @store_general_mask_factor3_midstart_fail( +; ALL-NOT: @llvm.arm.neon +; ALL: ret void ; %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19> store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4 @@ -630,17 +636,221 @@ define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i3 @g = external global <4 x float> ; The following does not give a valid interleaved store -; NEON-LABEL: define void @no_interleave -; NEON-NOT: call void @llvm.arm.neon.vst2 -; NEON: shufflevector -; NEON: store -; NEON: ret void -; NO_NEON-LABEL: define void @no_interleave -; NO_NEON: shufflevector -; NO_NEON: store -; NO_NEON: ret void +; ALL-LABEL: define void @no_interleave +; ALL-NOT: call void @llvm.arm.neon.vst2 +; ALL: shufflevector +; ALL: store +; ALL: ret void define void @no_interleave(<4 x float> %a0) { %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 7, i32 1, i32 undef> store <4 x float> %v0, <4 x float>* @g, align 16 ret void } + +define void @load_factor2_wide2(<16 x i32>* %ptr) { +; NEON-LABEL: @load_factor2_wide2( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; NEON-NEXT: 
[[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8* +; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4) +; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor2_wide2( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4 + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret void +} + +define void @load_factor2_wide3(<24 x i32>* %ptr) { +; NEON-LABEL: @load_factor2_wide3( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8* +; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4) +; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8 +; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8* +; NEON-NEXT: [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4) +; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1 +; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0 +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> +; NEON-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> +; NEON-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor2_wide3( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: 
ret void +; + %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4 + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23> + ret void +} + +define void @load_factor3_wide(<24 x i32>* %ptr) { +; NEON-LABEL: @load_factor3_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 +; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 +; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 +; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8* +; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4) +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2 +; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor3_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4 + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> + %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> + ret void +} + +define void @load_factor4_wide(<32 x i32>* %ptr) { +; NEON-LABEL: @load_factor4_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 +; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 +; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 +; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 
x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4) +; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3 +; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2 +; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor4_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4 + %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> + %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29> + %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30> + %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31> + ret void +} + +define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) { +; NEON-LABEL: @store_factor2_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8* +; NEON-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15> +; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor2_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4 + ret void +} + +define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) { +; NEON-LABEL: @store_factor3_wide( +; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* 
+; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19> +; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) +; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 +; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8* +; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23> +; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor3_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> + store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4 + ret void +} + +define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) { +; NEON-LABEL: @store_factor4_wide( +; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19> +; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27> +; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4) +; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15> +; NEON-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23> +; NEON-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31> +; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> 
[[TMP11]], <4 x i32> [[TMP12]], i32 4) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor4_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31> + store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4 + ret void +} + +define void @load_factor2_fp128(<4 x fp128>* %ptr) { +; ALL-LABEL: @load_factor2_fp128( +; ALL-NOT: @llvm.arm.neon +; ALL: ret void +; + %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16 + %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2> + %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3> + ret void +} diff --git a/test/Transforms/JumpThreading/guards.ll b/test/Transforms/JumpThreading/guards.ll new file mode 100644 index 000000000000..eac2b5dcd85f --- /dev/null +++ b/test/Transforms/JumpThreading/guards.ll @@ -0,0 +1,183 @@ +; RUN: opt < %s -jump-threading -dce -S | FileCheck %s + +declare void @llvm.experimental.guard(i1, ...) + +declare i32 @f1() +declare i32 @f2() + +define i32 @branch_implies_guard(i32 %a) { +; CHECK-LABEL: @branch_implies_guard( + %cond = icmp slt i32 %a, 10 + br i1 %cond, label %T1, label %F1 + +T1: +; CHECK: T1.split +; CHECK: %v1 = call i32 @f1() +; CHECK-NEXT: %retVal +; CHECK-NEXT: br label %Merge + %v1 = call i32 @f1() + br label %Merge + +F1: +; CHECK: F1.split +; CHECK: %v2 = call i32 @f2() +; CHECK-NEXT: %retVal +; CHECK-NEXT: %condGuard +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard +; CHECK-NEXT: br label %Merge + %v2 = call i32 @f2() + br label %Merge + +Merge: +; CHECK: Merge +; CHECK-NOT: call void(i1, ...) @llvm.experimental.guard( + %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ] + %retVal = add i32 %retPhi, 10 + %condGuard = icmp slt i32 %a, 20 + call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ] + ret i32 %retVal +} + +define i32 @not_branch_implies_guard(i32 %a) { +; CHECK-LABEL: @not_branch_implies_guard( + %cond = icmp slt i32 %a, 20 + br i1 %cond, label %T1, label %F1 + +T1: +; CHECK: T1.split: +; CHECK-NEXT: %v1 = call i32 @f1() +; CHECK-NEXT: %retVal +; CHECK-NEXT: %condGuard +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard +; CHECK-NEXT: br label %Merge + %v1 = call i32 @f1() + br label %Merge + +F1: +; CHECK: F1.split: +; CHECK-NEXT: %v2 = call i32 @f2() +; CHECK-NEXT: %retVal +; CHECK-NEXT: br label %Merge + %v2 = call i32 @f2() + br label %Merge + +Merge: +; CHECK: Merge +; CHECK-NOT: call void(i1, ...) @llvm.experimental.guard( + %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ] + %retVal = add i32 %retPhi, 10 + %condGuard = icmp sgt i32 %a, 10 + call void(i1, ...) 
@llvm.experimental.guard(i1 %condGuard) [ "deopt"() ] + ret i32 %retVal +} + +define i32 @branch_overlaps_guard(i32 %a) { +; CHECK-LABEL: @branch_overlaps_guard( + %cond = icmp slt i32 %a, 20 + br i1 %cond, label %T1, label %F1 + +T1: +; CHECK: T1: +; CHECK-NEXT: %v1 = call i32 @f1() +; CHECK-NEXT: br label %Merge + %v1 = call i32 @f1() + br label %Merge + +F1: +; CHECK: F1: +; CHECK-NEXT: %v2 = call i32 @f2() +; CHECK-NEXT: br label %Merge + %v2 = call i32 @f2() + br label %Merge + +Merge: +; CHECK: Merge +; CHECK: %condGuard = icmp slt i32 %a, 10 +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ] + %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ] + %retVal = add i32 %retPhi, 10 + %condGuard = icmp slt i32 %a, 10 + call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ] + ret i32 %retVal +} + +define i32 @branch_doesnt_overlap_guard(i32 %a) { +; CHECK-LABEL: @branch_doesnt_overlap_guard( + %cond = icmp slt i32 %a, 10 + br i1 %cond, label %T1, label %F1 + +T1: +; CHECK: T1: +; CHECK-NEXT: %v1 = call i32 @f1() +; CHECK-NEXT: br label %Merge + %v1 = call i32 @f1() + br label %Merge + +F1: +; CHECK: F1: +; CHECK-NEXT: %v2 = call i32 @f2() +; CHECK-NEXT: br label %Merge + %v2 = call i32 @f2() + br label %Merge + +Merge: +; CHECK: Merge +; CHECK: %condGuard = icmp sgt i32 %a, 20 +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ] + %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ] + %retVal = add i32 %retPhi, 10 + %condGuard = icmp sgt i32 %a, 20 + call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ] + ret i32 %retVal +} + +define i32 @not_a_diamond1(i32 %a, i1 %cond1) { +; CHECK-LABEL: @not_a_diamond1( + br i1 %cond1, label %Pred, label %Exit + +Pred: +; CHECK: Pred: +; CHECK-NEXT: switch i32 %a, label %Exit + switch i32 %a, label %Exit [ + i32 10, label %Merge + i32 20, label %Merge + ] + +Merge: +; CHECK: Merge: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ] +; CHECK-NEXT: br label %Exit + call void(i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ] + br label %Exit + +Exit: +; CHECK: Exit: +; CHECK-NEXT: ret i32 %a + ret i32 %a +} + +define void @not_a_diamond2(i32 %a, i1 %cond1) { +; CHECK-LABEL: @not_a_diamond2( + br label %Parent + +Merge: + call void(i1, ...) @llvm.experimental.guard(i1 %cond1)[ "deopt"() ] + ret void + +Pred: +; CHECK-NEXT: Pred: +; CHECK-NEXT: switch i32 %a, label %Exit + switch i32 %a, label %Exit [ + i32 10, label %Merge + i32 20, label %Merge + ] + +Parent: + br label %Pred + +Exit: +; CHECK: Merge: +; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 %cond1) [ "deopt"() ] +; CHECK-NEXT: ret void + ret void +} diff --git a/test/Transforms/JumpThreading/thread-loads.ll b/test/Transforms/JumpThreading/thread-loads.ll index f54672d19566..3606e796cdd5 100644 --- a/test/Transforms/JumpThreading/thread-loads.ll +++ b/test/Transforms/JumpThreading/thread-loads.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -jump-threading -S | FileCheck %s -; RUN: opt < %s -passes=jump-threading -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -S | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin7" @@ -302,6 +302,229 @@ ret2: ret void } +define i32 @fn_noalias(i1 %c2,i64* noalias %P, i64* noalias %P2) { +; CHECK-LABEL: @fn_noalias +; CHECK-LABEL: cond1: +; CHECK: %[[LD1:.*]] = load i64, i64* %P +; CHECK: br i1 %c, label %[[THREAD:.*]], label %end +; CHECK-LABEL: cond2: +; CHECK: %[[LD2:.*]] = load i64, i64* %P +; CHECK-LABEL: cond3: +; CHECK: %[[PHI:.*]] = phi i64 [ %[[LD1]], %[[THREAD]] ], [ %[[LD2]], %cond2 ] +; CHECK: call void @fn3(i64 %[[PHI]]) +entry: + br i1 %c2, label %cond2, label %cond1 + +cond1: + %l1 = load i64, i64* %P + store i64 42, i64* %P2 + %c = icmp eq i64 %l1, 0 + br i1 %c, label %cond2, label %end + +cond2: + %l2 = load i64, i64* %P + call void @fn2(i64 %l2) + %c3 = icmp eq i64 %l2, 0 + br i1 %c3, label %cond3, label %end + +cond3: + call void @fn3(i64 %l2) + br label %end + +end: + ret i32 0 +} + +; This tests if we can thread from %sw.bb.i to %do.body.preheader.i67 through +; %sw.bb21.i. To make this happen, %l2 should be detected as a partially +; redundant load with %l3 across the store to %phase in %sw.bb21.i.
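; For orientation, a minimal sketch of the pattern described above
; (hypothetical IR, separate from the test that follows): a load that is
; fully available along one predecessor and missing along the other can be
; made fully redundant by duplicating the join block.
define i64 @pre_sketch(i1 %c, i64* noalias %p, i64* noalias %q) {
entry:
  br i1 %c, label %avail, label %miss

avail:
  %v1 = load i64, i64* %p    ; the value of *%p is already loaded here ...
  store i64 0, i64* %q       ; ... and the noalias store cannot clobber it
  br label %join

miss:                        ; no load of %p along this path
  br label %join

join:
  %v2 = load i64, i64* %p    ; only partially redundant: jump threading
  ret i64 %v2                ; duplicates %join so the %avail copy reuses %v1
}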
+ +%struct.NEXT_MOVE = type { i32, i32, i32* } +@hash_move = unnamed_addr global [65 x i32] zeroinitializer, align 4 +@current_move = internal global [65 x i32] zeroinitializer, align 4 +@last = internal unnamed_addr global [65 x i32*] zeroinitializer, align 8 +@next_status = internal unnamed_addr global [65 x %struct.NEXT_MOVE] zeroinitializer, align 8 +define fastcc i32 @Search(i64 %idxprom.i, i64 %idxprom.i89, i32 %c) { +; CHECK-LABEL: @Search +; CHECK-LABEL: sw.bb.i: +; CHECK: %[[LD1:.*]] = load i32, i32* %arrayidx185, align 4 +; CHECK: %[[C1:.*]] = icmp eq i32 %[[LD1]], 0 +; CHECK: br i1 %[[C1]], label %sw.bb21.i.thread, label %if.then.i64 +; CHECK-LABEL: sw.bb21.i.thread: +; CHECK: br label %[[THREAD_TO:.*]] +; CHECK-LABEL: sw.bb21.i: +; CHECK: %[[LD2:.*]] = load i32, i32* %arrayidx185, align 4 +; CHECK: %[[C2:.*]] = icmp eq i32 %[[LD2]], 0 +; CHECK:br i1 %[[C2]], label %[[THREAD_TO]], label %cleanup +entry: + %arrayidx185 = getelementptr inbounds [65 x i32], [65 x i32]* @hash_move, i64 0, i64 %idxprom.i + %arrayidx307 = getelementptr inbounds [65 x i32], [65 x i32]* @current_move, i64 0, i64 %idxprom.i + %arrayidx89 = getelementptr inbounds [65 x i32*], [65 x i32*]* @last, i64 0, i64 %idxprom.i + %phase = getelementptr inbounds [65 x %struct.NEXT_MOVE], [65 x %struct.NEXT_MOVE]* @next_status, i64 0, i64 %idxprom.i, i32 0 + br label %cond.true282 + +cond.true282: + switch i32 %c, label %sw.default.i [ + i32 1, label %sw.bb.i + i32 0, label %sw.bb21.i + ] + +sw.default.i: + br label %cleanup + +sw.bb.i: + %call.i62 = call fastcc i32* @GenerateCheckEvasions() + store i32* %call.i62, i32** %arrayidx89, align 8 + %l2 = load i32, i32* %arrayidx185, align 4 + %tobool.i63 = icmp eq i32 %l2, 0 + br i1 %tobool.i63, label %sw.bb21.i, label %if.then.i64 + +if.then.i64: ; preds = %sw.bb.i + store i32 7, i32* %phase, align 8 + store i32 %l2, i32* %arrayidx307, align 4 + %call16.i = call fastcc i32 @ValidMove(i32 %l2) + %tobool17.i = icmp eq i32 %call16.i, 0 + br i1 %tobool17.i, label %if.else.i65, label %cleanup + +if.else.i65: + call void @f65() + br label %sw.bb21.i + +sw.bb21.i: + store i32 10, i32* %phase, align 8 + %l3= load i32, i32* %arrayidx185, align 4 + %tobool27.i = icmp eq i32 %l3, 0 + br i1 %tobool27.i, label %do.body.preheader.i67, label %cleanup + +do.body.preheader.i67: + call void @f67() + ret i32 67 + +cleanup: + call void @Cleanup() + ret i32 0 +} + +declare fastcc i32* @GenerateCheckEvasions() +declare fastcc i32 @ValidMove(i32 %move) +declare void @f67() +declare void @Cleanup() +declare void @f65() + +define i32 @fn_SinglePred(i1 %c2,i64* %P) { +; CHECK-LABEL: @fn_SinglePred +; CHECK-LABEL: entry: +; CHECK: %[[L1:.*]] = load i64, i64* %P +; CHECK: br i1 %c, label %cond3, label %cond1 +; CHECK-LABEL: cond2: +; CHECK-NOT: load +; CHECK: %[[PHI:.*]] = phi i64 [ %[[L1]], %cond1 ] +; CHECK: call void @fn2(i64 %[[PHI]]) +; CHECK: br label %end +; CHECK-LABEL: cond3: +; CHECK: call void @fn2(i64 %l1) +; CHECK: call void @fn3(i64 %l1) + +entry: + %l1 = load i64, i64* %P + %c = icmp eq i64 %l1, 0 + br i1 %c, label %cond2, label %cond1 + +cond1: + br i1 %c2, label %cond2, label %end + +cond2: + %l2 = load i64, i64* %P + call void @fn2(i64 %l2) + %c3 = icmp eq i64 %l2, 0 + br i1 %c3, label %cond3, label %end + +cond3: + call void @fn3(i64 %l2) + br label %end + +end: + ret i32 0 +} + +define i32 @fn_SinglePredMultihop(i1 %c1, i1 %c2,i64* %P) { +; CHECK-LABEL: @fn_SinglePredMultihop +; CHECK-LABEL: entry: +; CHECK: %[[L1:.*]] = load i64, i64* %P +; CHECK: br i1 %c0, label %cond3, label 
%cond0 +; CHECK-LABEL: cond2: +; CHECK-NOT: load +; CHECK: %[[PHI:.*]] = phi i64 [ %[[L1]], %cond1 ] +; CHECK: call void @fn2(i64 %[[PHI]]) +; CHECK: br label %end +; CHECK-LABEL: cond3: +; CHECK: call void @fn2(i64 %l1) +; CHECK: call void @fn3(i64 %l1) + +entry: + %l1 = load i64, i64* %P + %c0 = icmp eq i64 %l1, 0 + br i1 %c0, label %cond2, label %cond0 + +cond0: + br i1 %c1, label %cond1, label %end + +cond1: + br i1 %c2, label %cond2, label %end + +cond2: + %l2 = load i64, i64* %P + call void @fn2(i64 %l2) + %c3 = icmp eq i64 %l2, 0 + br i1 %c3, label %cond3, label %end + +cond3: + call void @fn3(i64 %l2) + br label %end + +end: + ret i32 0 +} + +declare void @fn2(i64) +declare void @fn3(i64) + + +; Make sure we phi-translate and make the partially redundant load in +; merge fully redundant and then we can jump-thread the block with the +; store. +; +; CHECK-LABEL: define i32 @phi_translate_partial_redundant_loads(i32, i32*, i32* +; CHECK: merge.thread: +; CHECK: store +; CHECK: br label %left_x +; +; CHECK: left_x: +; CHECK-NEXT: ret i32 20 +define i32 @phi_translate_partial_redundant_loads(i32, i32*, i32*) { + %cmp0 = icmp ne i32 %0, 0 + br i1 %cmp0, label %left, label %right + +left: + store i32 1, i32* %1, align 4 + br label %merge + +right: + br label %merge + +merge: + %phiptr = phi i32* [ %1, %left ], [ %2, %right ] + %newload = load i32, i32* %phiptr, align 4 + %cmp1 = icmp slt i32 %newload, 5 + br i1 %cmp1, label %left_x, label %right_x + +left_x: + ret i32 20 + +right_x: + ret i32 10 +} + !0 = !{!3, !3, i64 0} !1 = !{!"omnipotent char", !2} !2 = !{!"Simple C/C++ TBAA"} diff --git a/test/Transforms/LICM/atomics.ll b/test/Transforms/LICM/atomics.ll index d23cb49c5486..15c461aeca27 100644 --- a/test/Transforms/LICM/atomics.ll +++ b/test/Transforms/LICM/atomics.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -S -basicaa -licm | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s ; Check that we can hoist unordered loads define i32 @test1(i32* nocapture %y) nounwind uwtable ssp { @@ -60,8 +60,7 @@ end: ; CHECK-NEXT: br label %loop } -; Don't try to "sink" unordered stores yet; it is legal, but the machinery -; isn't there. +; We can sink an unordered store define i32 @test4(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp { entry: br label %loop @@ -75,6 +74,149 @@ loop: end: ret i32 %vala ; CHECK-LABEL: define i32 @test4( +; CHECK-LABEL: loop: +; CHECK: load atomic i32, i32* %y monotonic +; CHECK-NOT: store +; CHECK-LABEL: end: +; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %vala +; CHECK: store atomic i32 %[[LCSSAPHI]], i32* %x unordered, align 4 +} + +; We currently don't handle ordered atomics.
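; The distinction test5 below turns on is only the ordering attribute of the
; store. A rough sketch (hypothetical IR, not part of the test): an unordered
; access merely guarantees the word is not torn, so only the loop's final
; value must reach memory, while a release store also orders all earlier
; writes and cannot simply be replaced by its last iteration.
define void @ordering_sketch(i32* %x, i32 %v) {
  ; candidate for sinking out of a loop: atomic, but imposes no ordering
  store atomic i32 %v, i32* %x unordered, align 4
  ; left in place: release synchronizes-with acquire loads elsewhere
  store atomic i32 %v, i32* %x release, align 4
  ret void
}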
+define i32 @test5(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp { +entry: + br label %loop + +loop: + %vala = load atomic i32, i32* %y monotonic, align 4 + store atomic i32 %vala, i32* %x release, align 4 + %exitcond = icmp ne i32 %vala, 0 + br i1 %exitcond, label %end, label %loop + +end: + ret i32 %vala +; CHECK-LABEL: define i32 @test5( ; CHECK: load atomic i32, i32* %y monotonic ; CHECK-NEXT: store atomic } + +; We currently don't touch volatiles +define i32 @test6(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp { +entry: + br label %loop + +loop: + %vala = load atomic i32, i32* %y monotonic, align 4 + store volatile i32 %vala, i32* %x, align 4 + %exitcond = icmp ne i32 %vala, 0 + br i1 %exitcond, label %end, label %loop + +end: + ret i32 %vala +; CHECK-LABEL: define i32 @test6( +; CHECK: load atomic i32, i32* %y monotonic +; CHECK-NEXT: store volatile +} + +; We currently don't touch volatiles +define i32 @test6b(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp { +entry: + br label %loop + +loop: + %vala = load atomic i32, i32* %y monotonic, align 4 + store atomic volatile i32 %vala, i32* %x unordered, align 4 + %exitcond = icmp ne i32 %vala, 0 + br i1 %exitcond, label %end, label %loop + +end: + ret i32 %vala +; CHECK-LABEL: define i32 @test6b( +; CHECK: load atomic i32, i32* %y monotonic +; CHECK-NEXT: store atomic volatile +} + +; Mixing unordered atomics and normal loads/stores is +; currently unimplemented +define i32 @test7(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp { +entry: + br label %loop + +loop: + store i32 5, i32* %x + %vala = load atomic i32, i32* %y monotonic, align 4 + store atomic i32 %vala, i32* %x unordered, align 4 + %exitcond = icmp ne i32 %vala, 0 + br i1 %exitcond, label %end, label %loop + +end: + ret i32 %vala +; CHECK-LABEL: define i32 @test7( +; CHECK: store i32 5, i32* %x +; CHECK-NEXT: load atomic i32, i32* %y +; CHECK-NEXT: store atomic i32 +} + +; Three provably noalias locations - we can sink normal and unordered, but +; not monotonic +define i32 @test7b(i32* nocapture noalias %x, i32* nocapture %y, i32* noalias nocapture %z) nounwind uwtable ssp { +entry: + br label %loop + +loop: + store i32 5, i32* %x + %vala = load atomic i32, i32* %y monotonic, align 4 + store atomic i32 %vala, i32* %z unordered, align 4 + %exitcond = icmp ne i32 %vala, 0 + br i1 %exitcond, label %end, label %loop + +end: + ret i32 %vala +; CHECK-LABEL: define i32 @test7b( +; CHECK: load atomic i32, i32* %y monotonic + +; CHECK-LABEL: end: +; CHECK: store i32 5, i32* %x +; CHECK: store atomic i32 %{{.+}}, i32* %z unordered, align 4 +} + + +define i32 @test8(i32* nocapture noalias %x, i32* nocapture %y) { +entry: + br label %loop + +loop: + %vala = load atomic i32, i32* %y monotonic, align 4 + store atomic i32 %vala, i32* %x unordered, align 4 + fence release + %exitcond = icmp ne i32 %vala, 0 + br i1 %exitcond, label %end, label %loop + +end: + ret i32 %vala +; CHECK-LABEL: define i32 @test8( +; CHECK-LABEL: loop: +; CHECK: load atomic i32, i32* %y monotonic +; CHECK-NEXT: store atomic +; CHECK-NEXT: fence +} + +; Exact semantics of monotonic accesses are a bit vague in the C++ spec; +; for the moment, be conservative and don't touch them.
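; A sketch of the concern (hypothetical IR; LangRef maps monotonic to C++
; memory_order_relaxed, which still guarantees a single per-address
; modification order): each iteration below publishes a new value of %x,
; and sinking the store would collapse that observable sequence to just
; its final element.
define void @monotonic_sketch(i32* %x, i32 %n) {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  ; appends %i to the modification order of %x every iteration
  store atomic i32 %i, i32* %x monotonic, align 4
  %i.next = add i32 %i, 1
  %cond = icmp slt i32 %i.next, %n
  br i1 %cond, label %loop, label %exit

exit:
  ret void
}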
+define i32 @test9(i32* nocapture noalias %x, i32* nocapture %y) { +entry: + br label %loop + +loop: + %vala = load atomic i32, i32* %y monotonic, align 4 + store atomic i32 %vala, i32* %x monotonic, align 4 + %exitcond = icmp ne i32 %vala, 0 + br i1 %exitcond, label %end, label %loop + +end: + ret i32 %vala +; CHECK-LABEL: define i32 @test9( +; CHECK-LABEL: loop: +; CHECK: load atomic i32, i32* %y monotonic +; CHECK-NEXT: store atomic i32 %vala, i32* %x monotonic, align 4 +} diff --git a/test/Transforms/LICM/constexpr.ll b/test/Transforms/LICM/constexpr.ll index 8ffc73513600..488821ac8fd4 100644 --- a/test/Transforms/LICM/constexpr.ll +++ b/test/Transforms/LICM/constexpr.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -S -basicaa -licm | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s ; This fixes PR22460 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/test/Transforms/LICM/hoist-bitcast-load.ll b/test/Transforms/LICM/hoist-bitcast-load.ll index 6ef00738820e..956c7283be31 100644 --- a/test/Transforms/LICM/hoist-bitcast-load.ll +++ b/test/Transforms/LICM/hoist-bitcast-load.ll @@ -1,5 +1,6 @@ ; RUN: opt -S -basicaa -licm < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Transforms/LICM/hoist-deref-load.ll b/test/Transforms/LICM/hoist-deref-load.ll index e67becdeb5e4..b48c9e5c7b14 100644 --- a/test/Transforms/LICM/hoist-deref-load.ll +++ b/test/Transforms/LICM/hoist-deref-load.ll @@ -1,5 +1,6 @@ ; RUN: opt -S -basicaa -licm < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Transforms/LICM/hoist-fast-fdiv.ll b/test/Transforms/LICM/hoist-fast-fdiv.ll new file mode 100644 index 000000000000..f61564fd726c --- /dev/null +++ b/test/Transforms/LICM/hoist-fast-fdiv.ll @@ -0,0 +1,34 @@ +; RUN: opt -licm -S < %s | FileCheck %s + +; Function Attrs: noinline norecurse nounwind readnone ssp uwtable +define zeroext i1 @f(double %v) #0 { +entry: +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: fdiv fast double 1.000000e+00, %v + br label %loop + +loop: ; preds = %entry, %loop + %v3 = phi i32 [ 0, %entry ], [ %v11, %loop ] + %v4 = phi i32 [ 0, %entry ], [ %v12, %loop ] + %v5 = uitofp i32 %v4 to double + +; CHECK-LABEL: loop: +; CHECK: fmul fast double +; CHECK-NOT: fdiv + %v6 = fdiv fast double %v5, %v + %v7 = fptoui double %v6 to i64 + %v8 = and i64 %v7, 1 + %v9 = xor i64 %v8, 1 + %v10 = trunc i64 %v9 to i32 + %v11 = add i32 %v10, %v3 + %v12 = add nuw i32 %v4, 1 + %v13 = icmp eq i32 %v12, -1 + br i1 %v13, label %end, label %loop + +end: ; preds = %loop + %v15 = phi i32 [ %v11, %loop ] 
+ %v16 = icmp ne i32 %v15, 0 + ret i1 %v16 +} + diff --git a/test/Transforms/LICM/hoist-nounwind.ll b/test/Transforms/LICM/hoist-nounwind.ll index e9720235893a..9fc4903b8302 100644 --- a/test/Transforms/LICM/hoist-nounwind.ll +++ b/test/Transforms/LICM/hoist-nounwind.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -basicaa -licm < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Transforms/LICM/hoist-round.ll b/test/Transforms/LICM/hoist-round.ll index 9c6a3a180b50..87a7050668de 100644 --- a/test/Transforms/LICM/hoist-round.ll +++ b/test/Transforms/LICM/hoist-round.ll @@ -18,6 +18,7 @@ target datalayout = "E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:3 ; CHECK: call float @llvm.copysign.f32 ; CHECK: call float @llvm.minnum.f32 ; CHECK: call float @llvm.maxnum.f32 +; CHECK: call float @llvm.powi.f32 ; CHECK: for.body: define void @test(float %arg1, float %arg2) { @@ -40,7 +41,8 @@ for.body: %tmp.8 = call float @llvm.copysign.f32(float %tmp.7, float %arg2) %tmp.9 = call float @llvm.minnum.f32(float %tmp.8, float %arg2) %tmp.10 = call float @llvm.maxnum.f32(float %tmp.9, float %arg2) - call void @consume(float %tmp.10) + %tmp.11 = call float @llvm.powi.f32(float %tmp.10, i32 4) + call void @consume(float %tmp.11) %IND.new = add i32 %IND, 1 br label %for.head @@ -60,3 +62,4 @@ declare float @llvm.fabs.f32(float) declare float @llvm.copysign.f32(float, float) declare float @llvm.minnum.f32(float, float) declare float @llvm.maxnum.f32(float, float) +declare float @llvm.powi.f32(float, i32) diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll index 29595b3e1cc0..cbd17689e939 100644 --- a/test/Transforms/LICM/hoisting.ll +++ b/test/Transforms/LICM/hoisting.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -licm -S | FileCheck %s -; RUN: opt -lcssa %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -S | FileCheck %s @X = global i32 0 ; <i32*> [#uses=1] @@ -149,3 +149,174 @@ latch: return: ret i32 %sum } + +declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly +declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) nounwind +declare void @escaping.invariant.start({}*) nounwind +; invariant.start dominates the load, and in this scope, the +; load is invariant. So, we can hoist the `addrld` load out of the loop. 
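; In sketch form, the contract the invariant.start tests below lean on
; (hypothetical IR, reusing the declarations above; not part of the test):
; the token returned by invariant.start marks the named bytes immutable
; until it reaches invariant.end or escapes.
define i32 @invariant_sketch(i8* %p) {
  %p.i32 = bitcast i8* %p to i32*
  store i32 7, i32* %p.i32
  ; from here on the 4 bytes at %p may be assumed unchanged, as long as
  ; %tok is never passed to llvm.invariant.end.p0i8 and never escapes
  %tok = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %p)
  %v = load i32, i32* %p.i32   ; safe to treat as invariant (always 7)
  ret i32 %v
}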
+define i32 @test_fence(i8* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence +; CHECK-LABEL: entry +; CHECK: invariant.start +; CHECK: %addrld = load atomic i32, i32* %addr.i unordered, align 8 +; CHECK: br label %loop +entry: + %gep = getelementptr inbounds i8, i8* %addr, i64 8 + %addr.i = bitcast i8* %gep to i32 * + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} + + + +; Same as test above, but the load is no longer invariant (presence of +; invariant.end). We cannot hoist the addrld out of the loop. +define i32 @test_fence1(i8* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence1 +; CHECK-LABEL: entry +; CHECK: invariant.start +; CHECK-NEXT: invariant.end +; CHECK-NEXT: br label %loop +entry: + %gep = getelementptr inbounds i8, i8* %addr, i64 8 + %addr.i = bitcast i8* %gep to i32 * + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + call void @llvm.invariant.end.p0i8({}* %invst, i64 4, i8* %gep) + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} + +; Same as test above, but instead of invariant.end, we have the result of +; invariant.start escaping through a call. We cannot hoist the load. +define i32 @test_fence2(i8* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence2 +; CHECK-LABEL: entry +; CHECK-NOT: load +; CHECK: br label %loop +entry: + %gep = getelementptr inbounds i8, i8* %addr, i64 8 + %addr.i = bitcast i8* %gep to i32 * + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + call void @escaping.invariant.start({}* %invst) + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} + +; FIXME: invariant.start dominates the load, and in this scope, the +; load is invariant. So, we can hoist the `addrld` load out of the loop.
+; Consider the load operand addr.i bitcasted before being passed to +; invariant.start. +define i32 @test_fence3(i32* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence3 +; CHECK-LABEL: entry +; CHECK: invariant.start +; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8 +; CHECK: br label %loop +entry: + %addr.i = getelementptr inbounds i32, i32* %addr, i64 8 + %gep = bitcast i32* %addr.i to i8 * + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} + +; We should not hoist the addrld out of the loop. +define i32 @test_fence4(i32* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence4 +; CHECK-LABEL: entry +; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8 +; CHECK: br label %loop +entry: + %addr.i = getelementptr inbounds i32, i32* %addr, i64 8 + %gep = bitcast i32* %addr.i to i8 * + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} diff --git a/test/Transforms/LICM/loopsink.ll b/test/Transforms/LICM/loopsink.ll index 5004752d1031..b203ea8b51ad 100644 --- a/test/Transforms/LICM/loopsink.ll +++ b/test/Transforms/LICM/loopsink.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-sink < %s | FileCheck %s +; RUN: opt -S -passes=loop-sink < %s | FileCheck %s @g = global i32 0, align 4 diff --git a/test/Transforms/LICM/opt-remarks.ll b/test/Transforms/LICM/opt-remarks.ll index f0ef386c9f9a..b44fc57131a5 100644 --- a/test/Transforms/LICM/opt-remarks.ll +++ b/test/Transforms/LICM/opt-remarks.ll @@ -10,7 +10,7 @@ Loop: %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ] %addr = getelementptr i32, i32* %array, i32 %j %a = load i32, i32* %addr -; CHECK: remark: /tmp/kk.c:2:20: hosting load +; CHECK: remark: /tmp/kk.c:2:20: hoisting load %b = load i32, i32* %p, !dbg !8 %a2 = add i32 %a, %b store i32 %a2, i32* %addr diff --git a/test/Transforms/LICM/pr32129.ll b/test/Transforms/LICM/pr32129.ll new file mode 100644 index 000000000000..2618afe46322 --- /dev/null +++ b/test/Transforms/LICM/pr32129.ll @@ -0,0 +1,18 @@ +; RUN: opt -S -licm -loop-unswitch -licm < %s | FileCheck %s + +declare void @llvm.experimental.guard(i1, ...) + +define void @test() { +; CHECK-LABEL: @test( +; CHECK-NOT: guard +entry: + br label %header + +header: + br label %loop + +loop: + %0 = icmp ult i32 0, 400 + call void (i1, ...) 
@llvm.experimental.guard(i1 %0, i32 9) [ "deopt"() ] + br i1 undef, label %header, label %loop +} diff --git a/test/Transforms/LICM/scalar-promote-unwind.ll b/test/Transforms/LICM/scalar-promote-unwind.ll new file mode 100644 index 000000000000..f1f52eed1d4c --- /dev/null +++ b/test/Transforms/LICM/scalar-promote-unwind.ll @@ -0,0 +1,263 @@ +; RUN: opt < %s -basicaa -licm -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Make sure we don't hoist the store out of the loop; %a would +; have the wrong value if f() unwinds + +define void @test1(i32* nocapture noalias %a, i1 zeroext %y) uwtable { +entry: + br label %for.body + +for.body: + %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, i32* %a, align 4 + br i1 %y, label %if.then, label %for.inc + +; CHECK: define void @test1 +; CHECK: load i32, i32* +; CHECK-NEXT: add +; CHECK-NEXT: store i32 + +if.then: + tail call void @f() + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc, 10000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; We can hoist the store out of the loop here; if f() unwinds, +; the lifetime of %a ends. + +define void @test2(i1 zeroext %y) uwtable { +entry: + %a = alloca i32 + br label %for.body + +for.body: + %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, i32* %a, align 4 + br i1 %y, label %if.then, label %for.inc + +if.then: + tail call void @f() + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc, 10000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: +; CHECK: define void @test2 +; CHECK: store i32 +; CHECK-NEXT: ret void + ret void +} + +@_ZTIi = external constant i8* + +; In this test, the loop is within a try block. There is an explicit unwind edge out of the loop. 
+; Make sure this edge is treated as a loop exit, and that the loads and stores are promoted as +; expected. +define void @loop_within_tryblock() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %a = alloca i32, align 4 + store i32 0, i32* %a, align 4 + br label %for.cond + +for.cond: + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 1024 + br i1 %cmp, label %for.body, label %for.end + +; CHECK: for.body: +; CHECK-NOT: load +; CHECK-NOT: store +; CHECK: invoke +for.body: + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, i32* %a, align 4 + invoke void @boo() + to label %invoke.cont unwind label %lpad + +invoke.cont: + br label %for.inc + +for.inc: + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +; CHECK: lpad: +; CHECK: store +; CHECK: br +lpad: + %1 = landingpad { i8*, i32 } + catch i8* bitcast (i8** @_ZTIi to i8*) + %2 = extractvalue { i8*, i32 } %1, 0 + %3 = extractvalue { i8*, i32 } %1, 1 + br label %catch.dispatch + +catch.dispatch: + %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) #3 + %matches = icmp eq i32 %3, %4 + br i1 %matches, label %catch, label %eh.resume + +catch: + %5 = call i8* @__cxa_begin_catch(i8* %2) #3 + %6 = bitcast i8* %5 to i32* + %7 = load i32, i32* %6, align 4 + call void @__cxa_end_catch() #3 + br label %try.cont + +try.cont: + ret void + +for.end: + br label %try.cont + +eh.resume: + %lpad.val = insertvalue { i8*, i32 } undef, i8* %2, 0 + %lpad.val3 = insertvalue { i8*, i32 } %lpad.val, i32 %3, 1 + resume { i8*, i32 } %lpad.val3 +} + + +; The malloc'ed memory is not captured and therefore promoted. +define void @malloc_no_capture() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %call = call i8* @malloc(i64 4) + %0 = bitcast i8* %call to i32* + br label %for.body + +; CHECK: for.body: +; CHECK-NOT: load +; CHECK-NOT: store +; CHECK: br +for.body: + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.latch ] + %1 = load i32, i32* %0, align 4 + %add = add nsw i32 %1, 1 + store i32 %add, i32* %0, align 4 + br label %for.call + +for.call: + invoke void @boo() + to label %invoke.cont unwind label %lpad + +invoke.cont: + br label %for.latch + +for.latch: + %inc = add i32 %i.0, 1 + %cmp = icmp slt i32 %i.0, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: + br label %fun.ret + +lpad: + %2 = landingpad { i8*, i32 } + catch i8* null + %3 = extractvalue { i8*, i32 } %2, 0 + %4 = extractvalue { i8*, i32 } %2, 1 + br label %catch + +catch: + %5 = call i8* @__cxa_begin_catch(i8* %3) #4 + %6 = bitcast i32* %0 to i8* + call void @free(i8* %6) + call void @__cxa_end_catch() + br label %fun.ret + +fun.ret: + ret void +} + +; The malloc'ed memory can be captured and therefore not promoted.
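; The capturing step in the test below is the store of the pointer into %A.
; In isolation (hypothetical IR, not part of the test) the pattern is just:
define void @capture_sketch(i32* %p, i32** %slot) {
  ; storing %p to memory captures it: an unwind path, or any other code,
  ; may reload the pointer and inspect the pointee, so in-loop loads and
  ; stores through it can no longer be promoted to a register
  store i32* %p, i32** %slot
  ret void
}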
+define void @malloc_capture(i32** noalias %A) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %call = call i8* @malloc(i64 4) + %0 = bitcast i8* %call to i32* + br label %for.body + +; CHECK: for.body: +; CHECK: load +; CHECK: store +; CHECK: br +for.body: + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.latch ] + %1 = load i32, i32* %0, align 4 + %add = add nsw i32 %1, 1 + store i32 %add, i32* %0, align 4 + br label %for.call + +for.call: + invoke void @boo_readnone() + to label %invoke.cont unwind label %lpad + +invoke.cont: + br label %for.latch + +for.latch: + store i32* %0, i32** %A + %inc = add i32 %i.0, 1 + %cmp = icmp slt i32 %i.0, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: + br label %fun.ret + +lpad: + %2 = landingpad { i8*, i32 } + catch i8* null + %3 = extractvalue { i8*, i32 } %2, 0 + %4 = extractvalue { i8*, i32 } %2, 1 + br label %catch + +catch: + %5 = call i8* @__cxa_begin_catch(i8* %3) #4 + %6 = bitcast i32* %0 to i8* + call void @free(i8* %6) + call void @__cxa_end_catch() + br label %fun.ret + +fun.ret: + ret void +} + +; Function Attrs: nounwind +declare noalias i8* @malloc(i64) + +; Function Attrs: nounwind +declare void @free(i8* nocapture) + +declare void @boo() + +; This is an artificial example; readnone functions by definition cannot unwind +; exceptions by calling the C++ exception throwing methods. +; This function should only be used to test malloc_capture. +declare void @boo_readnone() readnone + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch() + +declare i32 @llvm.eh.typeid.for(i8*) + +declare void @f() uwtable diff --git a/test/Transforms/LICM/scalar_promote.ll b/test/Transforms/LICM/scalar-promote.ll index c88701154b8f..89888546494f 100644 --- a/test/Transforms/LICM/scalar_promote.ll +++ b/test/Transforms/LICM/scalar-promote.ll @@ -378,6 +378,33 @@ exit: ret i32 %ret } +define void @test10(i32 %i) { +Entry: + br label %Loop +; CHECK-LABEL: @test10( +; CHECK: Entry: +; CHECK-NEXT: load atomic i32, i32* @X unordered, align 4 +; CHECK-NEXT: br label %Loop + + +Loop: ; preds = %Loop, %0 + %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ] ; <i32> [#uses=1] + %x = load atomic i32, i32* @X unordered, align 4 + %x2 = add i32 %x, 1 + store atomic i32 %x2, i32* @X unordered, align 4 + %Next = add i32 %j, 1 + %cond = icmp eq i32 %Next, 0 + br i1 %cond, label %Out, label %Loop + +Out: + ret void +; CHECK: Out: +; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2 +; CHECK-NEXT: store atomic i32 %[[LCSSAPHI]], i32* @X unordered, align 4 +; CHECK-NEXT: ret void + +} + !0 = !{!4, !4, i64 0} !1 = !{!"omnipotent char", !2} !2 = !{!"Simple C/C++ TBAA"} diff --git a/test/Transforms/LICM/scalar_promote-unwind.ll b/test/Transforms/LICM/scalar_promote-unwind.ll deleted file mode 100644 index dd3693b4af63..000000000000 --- a/test/Transforms/LICM/scalar_promote-unwind.ll +++ /dev/null @@ -1,72 +0,0 @@ -; RUN: opt < %s -basicaa -licm -S | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Make sure we don't hoist the store out of the loop; %a would -; have the wrong value if f() unwinds - -define void @test1(i32* nocapture noalias %a, i1 zeroext %y) uwtable { -entry: - br label %for.body - -for.body: - %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %0 
= load i32, i32* %a, align 4 - %add = add nsw i32 %0, 1 - store i32 %add, i32* %a, align 4 - br i1 %y, label %if.then, label %for.inc - -; CHECK: define void @test1 -; CHECK: load i32, i32* -; CHECK-NEXT: add -; CHECK-NEXT: store i32 - -if.then: - tail call void @f() - br label %for.inc - -for.inc: - %inc = add nuw nsw i32 %i.03, 1 - %exitcond = icmp eq i32 %inc, 10000 - br i1 %exitcond, label %for.cond.cleanup, label %for.body - -for.cond.cleanup: - ret void -} - -; We can hoist the store out of the loop here; if f() unwinds, -; the lifetime of %a ends. - -define void @test2(i1 zeroext %y) uwtable { -entry: - %a = alloca i32 - br label %for.body - -for.body: - %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %0 = load i32, i32* %a, align 4 - %add = add nsw i32 %0, 1 - store i32 %add, i32* %a, align 4 - br i1 %y, label %if.then, label %for.inc - -if.then: - tail call void @f() - br label %for.inc - -for.inc: - %inc = add nuw nsw i32 %i.03, 1 - %exitcond = icmp eq i32 %inc, 10000 - br i1 %exitcond, label %for.cond.cleanup, label %for.body - -for.cond.cleanup: - ret void - -; CHECK: define void @test2 -; CHECK: store i32 -; CHECK-NEXT: ret void - ret void -} - -declare void @f() uwtable diff --git a/test/Transforms/LICM/sink.ll b/test/Transforms/LICM/sink.ll index cf169ddc12a9..70fa6fa13e3e 100644 --- a/test/Transforms/LICM/sink.ll +++ b/test/Transforms/LICM/sink.ll @@ -1,5 +1,7 @@ ; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-LICM ; RUN: opt -S -licm < %s | opt -S -loop-sink | FileCheck %s --check-prefix=CHECK-SINK +; RUN: opt -S < %s -passes='require<opt-remark-emit>,loop(licm),loop-sink' \ +; RUN: | FileCheck %s --check-prefix=CHECK-SINK ; Original source code: ; int g; diff --git a/test/Transforms/LICM/unrolled-deeply-nested.ll b/test/Transforms/LICM/unrolled-deeply-nested.ll new file mode 100644 index 000000000000..c0f2c9818000 --- /dev/null +++ b/test/Transforms/LICM/unrolled-deeply-nested.ll @@ -0,0 +1,76 @@ +; Test that LICM correctly detects conflicting accesses to memory in deeply +; nested subloops. This works in the legacy PM due to a special retained map of +; alias information for inner loops, and in the new PM it is recomputed for each +; loop. 
+; +; RUN: opt -S -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' < %s | FileCheck %s +; RUN: opt -S -basicaa -licm < %s | FileCheck %s + +define i32 @test(i32* %a, i64 %n.0, i64 %n.0.0, i64 %n.0.0.0, i64 %n.0.0.0.0) nounwind uwtable readonly { +; CHECK-LABEL: define i32 @test +entry: + %b = alloca i32 + %c = alloca i32 + %a.i8 = bitcast i32* %a to i8* + %b.i8 = bitcast i32* %b to i8* + %c.i8 = bitcast i32* %c to i8* + br label %l.0.header +; CHECK: %b = alloca i32 +; CHECK: %c = alloca i32 +; CHECK: %[[AI8:.*]] = bitcast i32* %a to i8* +; CHECK: %[[BI8:.*]] = bitcast i32* %b to i8* +; CHECK: %[[CI8:.*]] = bitcast i32* %c to i8* +; CHECK-NOT: load +; CHECK: br + +l.0.header: + %iv.0 = phi i64 [ %iv.0.next, %l.0.latch ], [ 0, %entry ] + %iv.0.next = add i64 %iv.0, 1 + %exitcond.0 = icmp eq i64 %iv.0.next, %n.0 + %a.val = load i32, i32* %a + store i32 %a.val, i32* %b + %c.val = trunc i64 %iv.0 to i32 + store i32 %c.val, i32* %c + br label %l.0.0.header +; CHECK: %[[AV:.*]] = load i32, i32* %a +; CHECK: store i32 %[[AV]], i32* %b +; CHECK: %[[CT:.*]] = trunc i64 {{.*}} to i32 +; CHECK: store i32 %[[CT]], i32* %c +; CHECK: br + +l.0.0.header: + %iv.0.0 = phi i64 [ %iv.0.0.next, %l.0.0.latch ], [ 0, %l.0.header ] + %iv.0.0.next = add i64 %iv.0.0, 1 + %exitcond.0.0 = icmp eq i64 %iv.0.0.next, %n.0.0 + br label %l.0.0.0.header +; CHECK: br + +l.0.0.0.header: + %iv.0.0.0 = phi i64 [ %iv.0.0.0.next, %l.0.0.0.header ], [ 0, %l.0.0.header ] + %iv.0.0.0.next = add i64 %iv.0.0.0, 1 + %exitcond.0.0.0 = icmp eq i64 %iv.0.0.0.next, %n.0.0.0 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a.i8, i8* %c.i8, i64 4, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b.i8, i8* %c.i8, i64 4, i32 1, i1 false) + br i1 %exitcond.0.0.0, label %l.0.0.0.header, label %l.0.0.latch +; CHECK: call void @llvm.memcpy.{{.*}}(i8* %[[AI8]], i8* %[[CI8]], i64 4 +; CHECK: call void @llvm.memcpy.{{.*}}(i8* %[[BI8]], i8* %[[CI8]], i64 4 +; CHECK: br + +l.0.0.latch: + br i1 %exitcond.0.0, label %l.0.0.header, label %l.0.latch +; CHECK: br + +l.0.latch: + %b.val = load i32, i32* %b + br i1 %exitcond.0, label %exit, label %l.0.header +; CHECK: %[[BV:.*]] = load i32, i32* %b +; CHECK: br + +exit: + %result.lcssa = phi i32 [ %b.val, %l.0.latch ] + ret i32 %b.val +; CHECK: %[[LCSSA:.*]] = phi i32 [ %[[BV]], %{{.*}} ] +; CHECK: ret i32 %[[LCSSA]] +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) diff --git a/test/Transforms/LoadCombine/deadcode.ll b/test/Transforms/LoadCombine/deadcode.ll new file mode 100644 index 000000000000..ed72824ffb44 --- /dev/null +++ b/test/Transforms/LoadCombine/deadcode.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -load-combine -S < %s | FileCheck %s + +; Dead loops like the one in this test case can be +; created by -jump-threading (this one was found by a csmith-generated program). +; +; According to -verify this is valid input (even if it could be debated whether +; the dead loop really satisfies SSA form). +; +; The problem found was that the -load-combine pass ends up in an infinite loop +; when analysing the 'bb1' basic block.
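; One step worth spelling out for the test below: the getelementptr in %bb1
; uses its own result as an operand,
;   %_tmp4 = load i16, i16* %_tmp10, align 1           ; uses %_tmp10 ...
;   %_tmp10 = getelementptr i16, i16* %_tmp10, i16 1   ; ... defined afterwards
; and -verify still accepts this because %bb1 is unreachable from the entry
; block, where dominance checks are trivially satisfied. A pass that walks
; such a block as ordinary straight-line code can chase that cycle forever,
; which is how the infinite loop arose.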
+define void @test1() { +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret void +; CHECK: bb1: +; CHECK-NEXT: [[_TMP4:%.*]] = load i16, i16* [[_TMP10:%.*]], align 1 +; CHECK-NEXT: [[_TMP10]] = getelementptr i16, i16* [[_TMP10]], i16 1 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[_TMP7:%.*]] = load i16, i16* [[_TMP12:%.*]], align 1 +; CHECK-NEXT: [[_TMP12]] = getelementptr i16, i16* [[_TMP12]], i16 1 +; CHECK-NEXT: br label [[BB2:%.*]] +; + ret void + +bb1: + %_tmp4 = load i16, i16* %_tmp10, align 1 + %_tmp10 = getelementptr i16, i16* %_tmp10, i16 1 + br label %bb1 + +; A second basic block. Running the test with -debug-pass=Executions shows +; that we only run the Dominator Tree Construction once for each function, +; even when the function has multiple basic blocks. +bb2: + %_tmp7 = load i16, i16* %_tmp12, align 1 + %_tmp12 = getelementptr i16, i16* %_tmp12, i16 1 + br label %bb2 + +} diff --git a/test/Transforms/LoadCombine/load-combine-aa.ll b/test/Transforms/LoadCombine/load-combine-aa.ll index fc639c0bc05d..5a577516fb47 100644 --- a/test/Transforms/LoadCombine/load-combine-aa.ll +++ b/test/Transforms/LoadCombine/load-combine-aa.ll @@ -1,4 +1,4 @@ -; RUN: opt -basicaa -load-combine -instcombine -S < %s | FileCheck %s +; RUN: opt -basicaa -load-combine -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -22,6 +22,7 @@ define i64 @test1(i32* nocapture readonly noalias %a, i32* nocapture readonly no define i64 @test2(i32* nocapture readonly %a, i32* nocapture readonly %b) { ; CHECK-LABEL: @test2 +; CHECK-NOT: load i64 ; CHECK: load i32, i32* ; CHECK: load i32, i32* ; CHECK: ret i64 @@ -37,3 +38,26 @@ define i64 @test2(i32* nocapture readonly %a, i32* nocapture readonly %b) { ret i64 %add } +%rec11 = type { i16, i16, i16 } +@str = global %rec11 { i16 1, i16 2, i16 3 } + +; PR31517 - Check that loads which span an aliasing store are not combined.
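; As a worked example of the hazard (using the field layout of @str above):
; the fields start as { 1, 2, 3 }.
;   %_tmp10 = load field 1       -> 2
;   store %_tmp10 to field 0     -> @str becomes { 2, 2, 3 }
;   %_tmp14 = load field 0       -> must observe 2
; Combining the two i16 loads into one wide load issued before the store
; would make %_tmp14 read the stale value 1 instead.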
+define i16 @test3() { +; CHECK-LABEL: @test3 + +; CHECK-NOT: load i32 +; CHECK: load i16, i16* +; CHECK: store i16 +; CHECK: load i16, i16* +; CHECK: ret i16 + + %_tmp9 = getelementptr %rec11, %rec11* @str, i16 0, i32 1 + %_tmp10 = load i16, i16* %_tmp9 + %_tmp12 = getelementptr %rec11, %rec11* @str, i16 0, i32 0 + store i16 %_tmp10, i16* %_tmp12 + %_tmp13 = getelementptr %rec11, %rec11* @str, i16 0, i32 0 + %_tmp14 = load i16, i16* %_tmp13 + %_tmp15 = icmp eq i16 %_tmp14, 3 + %_tmp16 = select i1 %_tmp15, i16 1, i16 0 + ret i16 %_tmp16 +} diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll index e6904ee50bca..4b2dab47a20f 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll @@ -15,7 +15,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; NOSCOPE: load float ; NOSCOPE: store float ; NOSCOPE: store float -define void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { entry: %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1 store float 0.0, float addrspace(1)* %a, align 4, !noalias !0 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll index 4369dafa4258..368dc6ab361e 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll @@ -10,7 +10,7 @@ target triple = "amdgcn--" ; ALIGNED: load i8, i8* %ptr0, align 1{{$}} ; ALIGNED: load i8, i8* %ptr1, align 1{{$}} -define void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i8], align 1 %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset %val0 = load i8, i8* %ptr0, align 1 @@ -27,7 +27,7 @@ define void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %o ; ALIGNED: load i16, i16* %ptr0, align 1{{$}} ; ALIGNED: load i16, i16* %ptr1, align 1{{$}} -define void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i16], align 1 %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset %val0 = load i16, i16* %ptr0, align 1 @@ -47,7 +47,7 @@ define void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 ; ALIGNED: load i32, i32* %ptr0, align 1 ; ALIGNED: load i32, i32* %ptr1, align 1 -define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 1 %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset %val0 = load i32, i32* %ptr0, align 1 @@ -64,8 +64,11 @@ define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 ; ALL: alloca [128 x i32], align 16 ; UNALIGNED: 
load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}} -; ALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 4{{$}} -define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { + +; FIXME: Should change alignment +; ALIGNED: load i32 +; ALIGNED: load i32 +define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 16 %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset %val0 = load i32, i32* %ptr0, align 1 @@ -82,7 +85,7 @@ define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias % ; ALIGNED: store i8 9, i8* %ptr0, align 1{{$}} ; ALIGNED: store i8 10, i8* %ptr1, align 1{{$}} -define void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i8], align 1 %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset store i8 9, i8* %ptr0, align 1 @@ -97,7 +100,7 @@ define void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 % ; ALIGNED: store i16 9, i16* %ptr0, align 1{{$}} ; ALIGNED: store i16 10, i16* %ptr1, align 1{{$}} -define void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i16], align 1 %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset store i16 9, i16* %ptr0, align 1 @@ -116,7 +119,7 @@ define void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 ; ALIGNED: store i32 9, i32* %ptr0, align 1 ; ALIGNED: store i32 10, i32* %ptr1, align 1 -define void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 1 %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset store i32 9, i32* %ptr0, align 1 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll index 25abb98c6ebd..8a75b8743fa5 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; CHECK: sext i32 %id.x to i64 ; CHECK: load <2 x float> ; CHECK: store <2 x float> zeroinitializer -define void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %sext.id.x = sext i32 %id.x to i64 @@ -32,7 +32,7 @@ entry: ; CHECK: zext i32 %id.x to i64 ; CHECK: load <2 x float> ; CHECK: store <2 x float> -define void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { entry: 
%id.x = call i32 @llvm.amdgcn.workitem.id.x() %zext.id.x = zext i32 %id.x to i64 @@ -54,7 +54,7 @@ entry: ; CHECK-LABEL: @merge_op_zext_index( ; CHECK: load <2 x float> ; CHECK: store <2 x float> -define void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { +define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %id.x, 2 @@ -81,7 +81,7 @@ entry: ; CHECK-LABEL: @merge_op_sext_index( ; CHECK: load <2 x float> ; CHECK: store <2 x float> -define void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { +define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %id.x, 2 @@ -112,7 +112,7 @@ entry: ; CHECK: loop: ; CHECK: load <2 x i32> ; CHECK: store <2 x i32> -define void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 { +define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 { entry: %cmp0 = icmp eq i32 %n, 0 br i1 %cmp0, label %exit, label %loop diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll index 2b2f9cbcf508..6182c09abcfe 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -11,7 +11,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK: load <2 x float> ; CHECK: %w = add i32 %y, 9 ; CHECK: %foo = add i32 %z, %w -define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { entry: %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx @@ -38,7 +38,7 @@ entry: ; CHECK: %w = add i32 %y, 9 ; CHECK: store <2 x float> ; CHECK: %foo = add i32 %z, %w -define void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { entry: %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll 
b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll index 4d6240a9aa9d..3f6d7ee7dcac 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll @@ -8,7 +8,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK: store double 0.000000e+00, double addrspace(1)* %a, ; CHECK: load double ; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1 -define void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { entry: %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll index fd0aaa615db0..0fcdc7b9083a 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll @@ -1,8 +1,9 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ALIGNED,ALL %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ALIGNED,ALL %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8-UNALIGNED -check-prefix=ALL %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ALIGNED,ALL %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16-UNALIGNED -check-prefix=ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s target datalayout = 
"e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" @@ -16,7 +17,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; ELT8-UNALIGNED: store <2 x i32> ; ELT16-UNALIGNED: store <4 x i32> -define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 %out.gep.3 = getelementptr i32, i32* %out, i32 3 @@ -28,9 +29,63 @@ define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { ret void } +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1( +; ALIGNED: store i32 9, i32* %out, align 1 +; ALIGNED: store i32 1, i32* %out.gep.1, align 1 +; ALIGNED: store i32 23, i32* %out.gep.2, align 1 +; ALIGNED: store i32 19, i32* %out.gep.3, align 1 + +; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 1 + +; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32>* %1, align 1 +; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32>* %2, align 1 + +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 { + %out.gep.1 = getelementptr i32, i32* %out, i32 1 + %out.gep.2 = getelementptr i32, i32* %out, i32 2 + %out.gep.3 = getelementptr i32, i32* %out, i32 3 + + store i32 9, i32* %out, align 1 + store i32 1, i32* %out.gep.1, align 1 + store i32 23, i32* %out.gep.2, align 1 + store i32 19, i32* %out.gep.3, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2( +; ALIGNED: store i32 9, i32* %out, align 2 +; ALIGNED: store i32 1, i32* %out.gep.1, align 2 +; ALIGNED: store i32 23, i32* %out.gep.2, align 2 +; ALIGNED: store i32 19, i32* %out.gep.3, align 2 + +; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 2 + +; ELT8-UNALIGNED: store <2 x i32> +; ELT8-UNALIGNED: store <2 x i32> + +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 { + %out.gep.1 = getelementptr i32, i32* %out, i32 1 + %out.gep.2 = getelementptr i32, i32* %out, i32 2 + %out.gep.3 = getelementptr i32, i32* %out, i32 3 + + store i32 9, i32* %out, align 2 + store i32 1, i32* %out.gep.1, align 2 + store i32 23, i32* %out.gep.2, align 2 + store i32 19, i32* %out.gep.3, align 2 + ret void +} + ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8( ; ALL: store <4 x i8> -define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { %out.gep.1 = getelementptr i8, i8* %out, i32 1 %out.gep.2 = getelementptr i8, i8* %out, i32 2 %out.gep.3 = getelementptr i8, i8* %out, i32 3 @@ -42,9 +97,28 @@ define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { ret void } +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1( +; ALIGNED: store i8 +; ALIGNED: store i8 +; ALIGNED: store i8 +; ALIGNED: store i8 + +; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8>* %1, align 1 +define amdgpu_kernel void 
@merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 { + %out.gep.1 = getelementptr i8, i8* %out, i32 1 + %out.gep.2 = getelementptr i8, i8* %out, i32 2 + %out.gep.3 = getelementptr i8, i8* %out, i32 3 + + store i8 9, i8* %out, align 1 + store i8 1, i8* %out.gep.1, align 1 + store i8 23, i8* %out.gep.2, align 1 + store i8 19, i8* %out.gep.3, align 1 + ret void +} + ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16( ; ALL: store <2 x i16> -define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { %out.gep.1 = getelementptr i16, i16* %out, i32 1 store i16 9, i16* %out, align 4 @@ -52,4 +126,106 @@ define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { ret void } +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2( +; ALIGNED: store i16 +; ALIGNED: store i16 + +; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 2 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 { + %out.gep.1 = getelementptr i16, i16* %out, i32 1 + + store i16 9, i16* %out, align 2 + store i16 12, i16* %out.gep.1, align 2 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1( +; ALIGNED: store i16 +; ALIGNED: store i16 + +; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 { + %out.gep.1 = getelementptr i16, i16* %out, i32 1 + + store i16 9, i16* %out, align 1 + store i16 12, i16* %out.gep.1, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8( +; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 8 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 { + %out.gep.1 = getelementptr i16, i16* %out, i32 1 + + store i16 9, i16* %out, align 8 + store i16 12, i16* %out.gep.1, align 2 + ret void +} + +; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32 +; ELT4: store i32 +; ELT4: store i32 +; ELT4: store i32 + +; ELT8-ALIGNED: store i32 +; ELT8-ALIGNED: store i32 +; ELT8-ALIGNED: store i32 + +; ELT8-UNALIGNED: store <2 x i32> +; ELT8-UNALIGNED: store i32 + +; ELT16-ALIGNED: store i32 +; ELT16-ALIGNED: store i32 +; ELT16-ALIGNED: store i32 + +; ELT16-UNALIGNED: store <3 x i32> +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 { + %out.gep.1 = getelementptr i32, i32* %out, i32 1 + %out.gep.2 = getelementptr i32, i32* %out, i32 2 + + store i32 9, i32* %out + store i32 1, i32* %out.gep.1 + store i32 23, i32* %out.gep.2 + ret void +} + +; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1( +; ALIGNED: store i32 +; ALIGNED: store i32 +; ALIGNED: store i32 + +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 + +; ELT8-UNALIGNED: store <2 x i32> +; ELT8-UNALIGNED: store i32 + +; ELT16-UNALIGNED: store <3 x i32> +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 { + %out.gep.1 = getelementptr i32, i32* %out, i32 1 + %out.gep.2 = getelementptr i32, i32* %out, i32 2 + + store i32 9, i32* %out, align 1 + store i32 1, i32* %out.gep.1, align 1 + store i32 23, i32* %out.gep.2, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1( +; ALIGNED: store i8 +; ALIGNED: store i8 +; ALIGNED: store i8 + +; UNALIGNED: store <3 x 
i8> +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 { + %out.gep.1 = getelementptr i8, i8* %out, i8 1 + %out.gep.2 = getelementptr i8, i8* %out, i8 2 + + store i8 9, i8* %out, align 1 + store i8 1, i8* %out.gep.1, align 1 + store i8 23, i8* %out.gep.2, align 1 + ret void +} + attributes #0 = { nounwind } diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll index d32387fa2c06..dbb7068eeae0 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK-LABEL: @merge_global_store_2_constants_i8( ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2 -define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -20,7 +20,7 @@ define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align ; CHECK: store <2 x i8> -define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -30,7 +30,7 @@ define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %o ; CHECK-LABEL: @merge_global_store_2_constants_i16 ; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -40,7 +40,7 @@ define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_0_i16 ; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 0, i16 addrspace(1)* %out.gep.1 @@ -50,7 +50,7 @@ define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align ; CHECK: store <2 x i16> -define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -60,7 +60,7 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align ; CHECK: store <2 x half> -define void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 { 
%out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 store half 2.0, half addrspace(1)* %out.gep.1 @@ -70,7 +70,7 @@ define void @merge_global_store_2_constants_half_natural_align(half addrspace(1) ; CHECK-LABEL: @merge_global_store_2_constants_i32 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 @@ -80,7 +80,7 @@ define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32 ; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* store float 1.0, float addrspace(1)* %out.gep.1.bc @@ -90,7 +90,7 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32 ; CHECK store <2 x float> <float 4.000000e+00, float 0x370EC00000000000>, <2 x float> addrspace(1)* %{{[0-9]+$}} -define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* store i32 123, i32 addrspace(1)* %out.gep.1.bc @@ -100,7 +100,7 @@ define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 ; CHECK-LABEL: @merge_global_store_4_constants_i32 ; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -114,7 +114,7 @@ define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_4_constants_f32_order ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}} -define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -129,7 +129,7 @@ define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) ; First store is out of order. 
; CHECK-LABEL: @merge_global_store_4_constants_f32 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -143,7 +143,7 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32 ; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -160,7 +160,7 @@ define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %o ; CHECK-LABEL: @merge_global_store_3_constants_i32 ; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -172,7 +172,7 @@ define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_i64 ; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 -define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 store i64 123, i64 addrspace(1)* %out.gep.1 @@ -183,7 +183,7 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_4_constants_i64 ; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 ; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 -define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 @@ -202,7 +202,7 @@ define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1 ; CHECK: store <2 x i32> [[INSERT1]] -define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr 
i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -220,7 +220,7 @@ define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; CHECK: insertelement ; CHECK: insertelement ; CHECK: store <2 x i32> -define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 @@ -241,7 +241,7 @@ define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace( ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1 ; CHECK: store <2 x i32> [[INSERT1]] -define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -256,7 +256,7 @@ define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* % ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -279,7 +279,7 @@ define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32 ; CHECK: load <3 x i32> ; CHECK: store <3 x i32> -define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -298,7 +298,7 @@ define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32 ; CHECK: load <4 x float> ; CHECK: store <4 x float> -define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -321,7 +321,7 @@ define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, f ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 @@ -346,7 +346,7 @@ define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace( ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -373,7 +373,7 @@ define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* % ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -408,7 +408,7 @@ define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* % ; CHECK: insertelement <4 x i8> ; CHECK: insertelement <4 x i8> ; CHECK: store <4 x i8> -define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -431,7 +431,7 @@ define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 ad ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align ; CHECK: load <4 x i8> ; CHECK: store <4 x i8> -define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -454,7 +454,7 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1 ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -474,7 +474,7 @@ define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out ; 
CHECK-LABEL: @merge_local_store_2_constants_i8 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2 -define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 store i8 123, i8 addrspace(3)* %out.gep.1 @@ -484,7 +484,7 @@ define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { ; CHECK-LABEL: @merge_local_store_2_constants_i32 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4 -define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 123, i32 addrspace(3)* %out.gep.1 @@ -495,7 +495,7 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2 ; CHECK: store i32 ; CHECK: store i32 -define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 123, i32 addrspace(3)* %out.gep.1, align 2 @@ -506,7 +506,7 @@ define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) # ; CHECK-LABEL: @merge_local_store_4_constants_i32 ; CHECK: store <2 x i32> <i32 456, i32 333>, <2 x i32> addrspace(3)* ; CHECK: store <2 x i32> <i32 1234, i32 123>, <2 x i32> addrspace(3)* -define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 @@ -521,7 +521,7 @@ define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { ; CHECK-LABEL: @merge_global_store_5_constants_i32 ; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store i32 -define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { store i32 9, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 12, i32 addrspace(1)* %idx1, align 4 @@ -537,7 +537,7 @@ define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { ; CHECK-LABEL: @merge_global_store_6_constants_i32 ; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { store i32 13, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 15, i32 addrspace(1)* %idx1, align 4 @@ -555,7 +555,7 @@ define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { ; CHECK-LABEL: @merge_global_store_7_constants_i32 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: 
store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -575,7 +575,7 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { ; CHECK-LABEL: @merge_global_store_8_constants_i32 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -597,7 +597,7 @@ define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { ; CHECK-LABEL: @copy_v3i32_align4 ; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 ; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out -define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 store <3 x i32> %vec, <3 x i32> addrspace(1)* %out ret void @@ -606,7 +606,7 @@ define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> a ; CHECK-LABEL: @copy_v3i64_align4 ; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 ; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out -define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 store <3 x i64> %vec, <3 x i64> addrspace(1)* %out ret void @@ -615,7 +615,7 @@ define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> a ; CHECK-LABEL: @copy_v3f32_align4 ; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 ; CHECK: store <3 x float> -define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0> store <3 x float> %fadd, <3 x float> addrspace(1)* %out @@ -625,7 +625,7 @@ define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x floa ; CHECK-LABEL: @copy_v3f64_align4 ; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 ; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out -define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { %vec = load <3 x double>, <3 x 
double> addrspace(1)* %in, align 4 %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0> store <3 x double> %fadd, <3 x double> addrspace(1)* %out diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll index 8885d61014fc..226147df66a6 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll @@ -5,7 +5,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK-LABEL: @merge_v2i32_v2i32( ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> zeroinitializer -define void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1 @@ -22,7 +22,7 @@ entry: ; CHECK-LABEL: @merge_v1i32_v1i32( ; CHECK: load <2 x i32> ; CHECK: store <2 x i32> zeroinitializer -define void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1 @@ -41,7 +41,7 @@ entry: ; CHECK: load <3 x i32> ; CHECK: store <3 x i32> zeroinitializer ; CHECK: store <3 x i32> zeroinitializer -define void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1 @@ -58,7 +58,7 @@ entry: ; CHECK-LABEL: @merge_v2i16_v2i16( ; CHECK: load <4 x i16> ; CHECK: store <4 x i16> zeroinitializer -define void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1 @@ -76,7 +76,7 @@ entry: ; CHECK-LABEL: @merge_load_i32_v2i16( ; CHECK: load i32, ; CHECK: load <2 x i16> -define void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1 %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)* diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll index ba792f783533..f353106607d6 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll @@ -7,7 +7,7 @@ ; CHECK-LABEL: @load_keep_base_alignment_missing_align( ; CHECK: load <2 x float>, <2 x float> 
addrspace(3)* %{{[0-9]+}}, align 4 -define void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { +define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 %val0 = load float, float addrspace(3)* %ptr0 @@ -21,7 +21,7 @@ define void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { ; CHECK-LABEL: @store_keep_base_alignment_missing_align( ; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 -define void @store_keep_base_alignment_missing_align() { +define amdgpu_kernel void @store_keep_base_alignment_missing_align() { %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 store float 0.0, float addrspace(3)* %arrayidx0 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll index 88eca363902f..8a78f3d7e9bc 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -11,7 +11,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64: ; CHECK: store i32 0 ; CHECK: store i32 0 -define void @no_crash(i32 %arg) { +define amdgpu_kernel void @no_crash(i32 %arg) { %tmp2 = add i32 %arg, 14 %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2 %tmp4 = add i32 %arg, 15 @@ -37,7 +37,7 @@ define void @no_crash(i32 %arg) { ; CHECK: load i32 ; CHECK: load i32 -define void @interleave_get_longest(i32 %arg) { +define amdgpu_kernel void @interleave_get_longest(i32 %arg) { %a1 = add i32 %arg, 1 %a2 = add i32 %arg, 2 %a3 = add i32 %arg, 3 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll index 4a429533df02..818189565b4c 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll @@ -5,7 +5,7 @@ ; CHECK: store i32 ; CHECK: store i32 ; CHECK: store i32 -define void @no_implicit_float(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll index 141e20a1f83c..28d29f8e8139 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: @optnone( ; CHECK: store i32 ; CHECK: store i32 -define void @optnone(i32 addrspace(1)* %out) noinline optnone { +define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 @@ -13,7 +13,7 @@ define void @optnone(i32 addrspace(1)* %out) noinline optnone { ; CHECK-LABEL: @do_opt( ; CHECK: store <2 x i32> -define void @do_opt(i32 addrspace(1)* %out) { +define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, 
i32 addrspace(1)* %out.gep.1 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll index 202e988ea5f1..65200b95d5e6 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* ; CHECK: store <2 x i64> zeroinitializer -define void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1 @@ -28,7 +28,7 @@ entry: ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* ; CHECK: store <2 x i32> zeroinitializer -define void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1 %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1 @@ -46,7 +46,7 @@ entry: ; CHECK: load <2 x i64> ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* -define void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)* @@ -61,7 +61,7 @@ entry: ; CHECK: load <2 x i64> ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0 ; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)* -define void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -76,7 +76,7 @@ entry: ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64 ; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0 ; CHECK: store <2 x i64> -define void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 { +define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -92,7 +92,7 @@ entry: ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1 ; CHECK: store <2 x i64> -define void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 { +define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 
addrspace(1)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)* @@ -107,7 +107,7 @@ entry: ; CHECK: load <2 x i32> ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1 ; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)* -define void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)* @@ -122,7 +122,7 @@ entry: ; CHECK: load <2 x i32> ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0 ; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)* -define void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 { entry: %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 @@ -137,7 +137,7 @@ entry: ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32 ; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 ; CHECK: store <2 x i32> -define void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 { +define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 { entry: %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 @@ -152,7 +152,7 @@ entry: ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32 ; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1 ; CHECK: store <2 x i32> -define void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 { +define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1 %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)* @@ -166,7 +166,7 @@ entry: ; CHECK-LABEL: @no_merge_store_ptr32_i64( ; CHECK: store i8 addrspace(3)* ; CHECK: store i64 -define void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 { +define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -181,7 +181,7 @@ entry: ; CHECK-LABEL: @no_merge_store_i64_ptr32( ; CHECK: store i64 ; CHECK: store i8 addrspace(3)* -define void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 { +define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1 %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)* @@ -195,7 +195,7 @@ entry: ; CHECK-LABEL: @no_merge_load_i64_ptr32( ; CHECK: load i64, ; CHECK: load i8 addrspace(3)*, -define void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 { 
+define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)* @@ -209,7 +209,7 @@ entry: ; CHECK-LABEL: @no_merge_load_ptr32_i64( ; CHECK: load i8 addrspace(3)*, ; CHECK: load i64, -define void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -226,7 +226,7 @@ entry: ; CHECK: load <2 x i8 addrspace(1)*> ; CHECK: store <2 x i8 addrspace(1)*> ; CHECK: store <2 x i8 addrspace(1)*> -define void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 { +define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 { entry: %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1 @@ -245,7 +245,7 @@ entry: ; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)* ; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 ; CHECK: bitcast i64 [[ELT1_INT]] to double -define void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 { entry: %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 @@ -262,7 +262,7 @@ entry: ; CHECK: bitcast i64 [[ELT0]] to double ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* -define void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)* @@ -279,7 +279,7 @@ entry: ; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 ; CHECK: store <2 x i64> -define void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 { +define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 { entry: %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 @@ -296,7 +296,7 @@ entry: ; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 ; CHECK: store <2 x i64> -define void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 { +define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 %a.cast = bitcast i8 
addrspace(1)* addrspace(1)* %a to double addrspace(1)* diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll index d70c449e14d7..63e688e63fbb 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll @@ -9,7 +9,7 @@ ; CHECK: store <4 x float> ; Function Attrs: nounwind -define void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 { +define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 { bb: %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)* %tmp1 = load float, float addrspace(1)* %tmp, align 4 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll index 18f62be27c82..412d2013f6b6 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll @@ -16,7 +16,7 @@ declare void @use_v2i9(<2 x i9>) ; CHECK-LABEL: @merge_store_2_constants_i1( ; CHECK: store i1 ; CHECK: store i1 -define void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 store i1 true, i1 addrspace(1)* %out.gep.1 store i1 false, i1 addrspace(1)* %out @@ -26,7 +26,7 @@ define void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constants_i2( ; CHECK: store i2 1 ; CHECK: store i2 -1 -define void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 store i2 1, i2 addrspace(1)* %out.gep.1 store i2 -1, i2 addrspace(1)* %out @@ -36,7 +36,7 @@ define void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_different_store_sizes_i1_i8( ; CHECK: store i1 true ; CHECK: store i8 123 -define void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i1 true, i1 addrspace(1)* %out.i1 @@ -47,7 +47,7 @@ define void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_different_store_sizes_i8_i1( ; CHECK: store i8 123 ; CHECK: store i1 true -define void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -58,7 +58,7 @@ define void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constant_structs( ; CHECK: store %struct.foo ; CHECK: store %struct.foo -define void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1 store 
%struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out @@ -69,7 +69,7 @@ define void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constants_v2i2( ; CHECK: store <2 x i2> ; CHECK: store <2 x i2> -define void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1 store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out @@ -81,7 +81,7 @@ define void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constants_v4i2( ; CHECK: store <4 x i2> ; CHECK: store <4 x i2> -define void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1 store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out @@ -91,7 +91,7 @@ define void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_i1( ; CHECK: load i1 ; CHECK: load i1 -define void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 %x = load i1, i1 addrspace(1)* %out.gep.1 %y = load i1, i1 addrspace(1)* %out @@ -103,7 +103,7 @@ define void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_i2( ; CHECK: load i2 ; CHECK: load i2 -define void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 %x = load i2, i2 addrspace(1)* %out.gep.1 %y = load i2, i2 addrspace(1)* %out @@ -115,7 +115,7 @@ define void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_different_load_sizes_i1_i8( ; CHECK: load i1 ; CHECK: load i8 -define void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 %x = load i1, i1 addrspace(1)* %out.i1 @@ -128,7 +128,7 @@ define void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_different_load_sizes_i8_i1( ; CHECK: load i8 ; CHECK: load i1 -define void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 %x = load i8, i8 addrspace(1)* %out.gep.1 @@ -141,7 +141,7 @@ define void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constant_structs( ; CHECK: load %struct.foo ; CHECK: load %struct.foo -define void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 %x 
= load %struct.foo, %struct.foo addrspace(1)* %out.gep.1 %y = load %struct.foo, %struct.foo addrspace(1)* %out @@ -153,7 +153,7 @@ define void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_v2i2( ; CHECK: load <2 x i2> ; CHECK: load <2 x i2> -define void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1 %y = load <2 x i2>, <2 x i2> addrspace(1)* %out @@ -165,7 +165,7 @@ define void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_v4i2( ; CHECK: load <4 x i2> ; CHECK: load <4 x i2> -define void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1 %y = load <4 x i2>, <4 x i2> addrspace(1)* %out @@ -177,7 +177,7 @@ define void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constants_i9( ; CHECK: store i9 3 ; CHECK: store i9 -5 -define void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1 store i9 3, i9 addrspace(1)* %out.gep.1 store i9 -5, i9 addrspace(1)* %out @@ -187,7 +187,7 @@ define void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_v2i9( ; CHECK: load <2 x i9> ; CHECK: load <2 x i9> -define void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1 %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1 %y = load <2 x i9>, <2 x i9> addrspace(1)* %out diff --git a/test/Transforms/LoadStoreVectorizer/X86/load-width.ll b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll new file mode 100644 index 000000000000..a61b25119a14 --- /dev/null +++ b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll @@ -0,0 +1,38 @@ +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s + +define <8 x double> @loadwidth_insert_extract(double* %ptr) { + %a = bitcast double* %ptr to <2 x double> * + %b = getelementptr <2 x double>, <2 x double>* %a, i32 1 + %c = getelementptr <2 x double>, <2 x double>* %a, i32 2 + %d = getelementptr <2 x double>, <2 x double>* %a, i32 3 +; CHECK-HSW: load <4 x double> +; CHECK-HSW: load <4 x double> +; CHECK-HSW-NOT: load +; CHECK-KNL: load <8 x double> +; CHECK-KNL-NOT: load + %la = load <2 x double>, <2 x double> *%a + %lb = load <2 x double>, <2 x double> *%b + %lc = load <2 x double>, <2 x double> *%c + %ld = load <2 x double>, <2 x double> *%d + ; Scalarize everything - Explicitly not a shufflevector to test this code + ; path in the LSV + %v1 = extractelement <2 x double> %la, i32 0 + %v2 = extractelement <2 x double> %la, i32 1 + %v3 = extractelement <2 x double> %lb, i32 0 + %v4 = extractelement <2 
x double> %lb, i32 1
+  %v5 = extractelement <2 x double> %lc, i32 0
+  %v6 = extractelement <2 x double> %lc, i32 1
+  %v7 = extractelement <2 x double> %ld, i32 0
+  %v8 = extractelement <2 x double> %ld, i32 1
+  ; Make a vector again
+  %i1 = insertelement <8 x double> undef, double %v1, i32 0
+  %i2 = insertelement <8 x double> %i1, double %v2, i32 1
+  %i3 = insertelement <8 x double> %i2, double %v3, i32 2
+  %i4 = insertelement <8 x double> %i3, double %v4, i32 3
+  %i5 = insertelement <8 x double> %i4, double %v5, i32 4
+  %i6 = insertelement <8 x double> %i5, double %v6, i32 5
+  %i7 = insertelement <8 x double> %i6, double %v7, i32 6
+  %i8 = insertelement <8 x double> %i7, double %v8, i32 7
+  ret <8 x double> %i8
+}
diff --git a/test/Transforms/LoopDeletion/invalidation.ll b/test/Transforms/LoopDeletion/invalidation.ll
new file mode 100644
index 000000000000..5564f90e1ea7
--- /dev/null
+++ b/test/Transforms/LoopDeletion/invalidation.ll
@@ -0,0 +1,42 @@
+; Ensure we don't run analyses over loops after they've been deleted. We run
+; one version with a no-op loop pass to make sure that the loop doesn't get
+; simplified away.
+;
+; RUN: opt < %s -passes='require<ivusers>,no-op-loop,require<ivusers>' -S \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,BEFORE
+; RUN: opt < %s -passes='require<ivusers>,loop-deletion,require<ivusers>' -S \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,AFTER
+
+
+define void @foo(i64 %n, i64 %m) nounwind {
+; CHECK-LABEL: @foo(
+
+entry:
+  br label %bb
+; CHECK: entry:
+; BEFORE-NEXT: br label %bb
+; AFTER-NEXT: br label %return
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb2 ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  br i1 %t1, label %bb2, label %return
+; BEFORE: bb:
+; BEFORE: br i1 {{.*}}, label %bb2, label %return
+; AFTER-NOT: bb:
+; AFTER-NOT: br
+
+bb2:
+  %t2 = icmp slt i64 %x.0, %m
+  br i1 %t1, label %bb, label %return
+; BEFORE: bb2:
+; BEFORE: br i1 {{.*}}, label %bb, label %return
+; AFTER-NOT: bb2:
+; AFTER-NOT: br
+
+return:
+  ret void
+; CHECK: return:
+; CHECK-NEXT: ret void
+}
diff --git a/test/Transforms/LoopDeletion/multiple-exit-conditions.ll b/test/Transforms/LoopDeletion/multiple-exit-conditions.ll
index d7d6badb1650..e7b47211d570
--- a/test/Transforms/LoopDeletion/multiple-exit-conditions.ll
+++ b/test/Transforms/LoopDeletion/multiple-exit-conditions.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -loop-deletion -S | FileCheck %s
-; RUN: opt < %s -passes='require<scalar-evolution>,loop(loop-deletion)' -S | FileCheck %s
+; RUN: opt < %s -passes='loop(loop-deletion)' -S | FileCheck %s
 
 ; ScalarEvolution can prove the loop iteration is finite, even though
 ; it can't represent the exact trip count as an expression. That's
diff --git a/test/Transforms/LoopDeletion/multiple-exits.ll b/test/Transforms/LoopDeletion/multiple-exits.ll
index dcf79057db54..760c3aae4ee7
--- a/test/Transforms/LoopDeletion/multiple-exits.ll
+++ b/test/Transforms/LoopDeletion/multiple-exits.ll
@@ -1,80 +1,138 @@
-; RUN: opt < %s -loop-deletion -S | FileCheck %s
+; Checks whether dead loops with multiple exits can be eliminated.
+; Note that we run loop-simplify and LCSSA over the test cases to make sure the
+; critical components remain after those passes and are visible to the loop
+; deletion pass.
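; [Editor's sketch, not part of the checked-in diff] For readers unfamiliar
; with the canonical form the comment above refers to: -loop-simplify gives
; every loop a preheader and dedicated exit blocks, and -lcssa routes every
; value used outside a loop through a phi in the exit block. A minimal example
; of that form (the function and value names here are hypothetical):
;
;   define i64 @lcssa_form(i64 %n) {
;   entry:                                    ; acts as the loop preheader
;     br label %loop
;   loop:
;     %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
;     %i.next = add i64 %i, 1
;     %cond = icmp slt i64 %i.next, %n
;     br i1 %cond, label %loop, label %exit
;   exit:                                     ; dedicated exit, reached only from %loop
;     %i.lcssa = phi i64 [ %i.next, %loop ]   ; LCSSA phi for the out-of-loop use
;     ret i64 %i.lcssa
;   }
;
; It is this structure (preheader, dedicated exits, LCSSA phis) that the BEFORE
; checks below pin down and that loop deletion rewrites in the AFTER checks.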
+; +; RUN: opt < %s -loop-simplify -lcssa -S | FileCheck %s --check-prefixes=CHECK,BEFORE +; RUN: opt < %s -loop-deletion -S | FileCheck %s --check-prefixes=CHECK,AFTER +; +; RUN: opt < %s -passes=no-op-loop -S | FileCheck %s --check-prefixes=CHECK,BEFORE +; RUN: opt < %s -passes=loop-deletion -S | FileCheck %s --check-prefixes=CHECK,AFTER -; Checks whether dead loops with multiple exits can be eliminated define void @foo(i64 %n, i64 %m) nounwind { ; CHECK-LABEL: @foo( -; CHECK: entry: -; CHECK-NEXT: br label %return -; CHECK: return: -; CHECK-NEXT: ret void entry: br label %bb +; CHECK: entry: +; BEFORE-NEXT: br label %bb +; AFTER-NEXT: br label %return bb: %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb2 ] %t0 = add i64 %x.0, 1 %t1 = icmp slt i64 %x.0, %n br i1 %t1, label %bb2, label %return +; BEFORE: bb: +; BEFORE: br i1 {{.*}}, label %bb2, label %return +; AFTER-NOT: bb: +; AFTER-NOT: br + bb2: %t2 = icmp slt i64 %x.0, %m br i1 %t1, label %bb, label %return +; BEFORE: bb2: +; BEFORE: br i1 {{.*}}, label %bb, label %return +; AFTER-NOT: bb2: +; AFTER-NOT: br return: ret void +; CHECK: return: +; CHECK-NEXT: ret void } -define i64 @bar(i64 %n, i64 %m) nounwind { -; CHECK-LABEL: @bar( -; CHECK: entry: -; CHECK-NEXT: br label %return - -; CHECK: return: -; CHECK-NEXT: ret i64 10 +define i64 @bar(i64 %n, i64 %m, i64 %maybe_zero) nounwind { +; CHECK-LABEL: @bar( entry: br label %bb +; CHECK: entry: +; BEFORE-NEXT: br label %bb +; AFTER-NEXT: br label %return bb: %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb3 ] %t0 = add i64 %x.0, 1 %t1 = icmp slt i64 %x.0, %n br i1 %t1, label %bb2, label %return +; BEFORE: bb: +; BEFORE: br i1 {{.*}}, label %bb2, label %return +; AFTER-NOT: bb: +; AFTER-NOT: br + bb2: %t2 = icmp slt i64 %x.0, %m + ; This unused division prevents unifying this loop exit path with others + ; because it can be deleted but cannot be hoisted. + %unused1 = udiv i64 42, %maybe_zero br i1 %t2, label %bb3, label %return +; BEFORE: bb2: +; BEFORE: br i1 {{.*}}, label %bb3, label %return +; AFTER-NOT: bb2: +; AFTER-NOT: br + bb3: %t3 = icmp slt i64 %x.0, %m + ; This unused division prevents unifying this loop exit path with others + ; because it can be deleted but cannot be hoisted. + %unused2 = sdiv i64 42, %maybe_zero br i1 %t3, label %bb, label %return +; BEFORE: bb3: +; BEFORE: br i1 {{.*}}, label %bb, label %return +; AFTER-NOT: bb3: +; AFTER-NOT: br return: %x.lcssa = phi i64 [ 10, %bb ], [ 10, %bb2 ], [ 10, %bb3 ] ret i64 %x.lcssa +; CHECK: return: +; BEFORE-NEXT: %[[X:.*]] = phi i64 [ 10, %bb ], [ 10, %bb2 ], [ 10, %bb3 ] +; AFTER-NEXT: %[[X:.*]] = phi i64 [ 10, %entry ] +; CHECK-NEXT: ret i64 %[[X]] } -define i64 @baz(i64 %n, i64 %m) nounwind { +; This function has a loop which looks like @bar's but that cannot be deleted +; because which path we exit through determines which value is selected. +define i64 @baz(i64 %n, i64 %m, i64 %maybe_zero) nounwind { ; CHECK-LABEL: @baz( -; CHECK: return: -; CHECK-NEXT: %x.lcssa = phi i64 [ 12, %bb ], [ 10, %bb2 ] -; CHECK-NEXT: ret i64 %x.lcssa entry: br label %bb +; CHECK: entry: +; CHECK-NEXT: br label %bb bb: %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb3 ] %t0 = add i64 %x.0, 1 %t1 = icmp slt i64 %x.0, %n br i1 %t1, label %bb2, label %return +; CHECK: bb: +; CHECK: br i1 {{.*}}, label %bb2, label %return + bb2: %t2 = icmp slt i64 %x.0, %m + ; This unused division prevents unifying this loop exit path with others + ; because it can be deleted but cannot be hoisted. 
+ %unused1 = udiv i64 42, %maybe_zero br i1 %t2, label %bb3, label %return +; CHECK: bb2: +; CHECK: br i1 {{.*}}, label %bb3, label %return + bb3: %t3 = icmp slt i64 %x.0, %m + ; This unused division prevents unifying this loop exit path with others + ; because it can be deleted but cannot be hoisted. + %unused2 = sdiv i64 42, %maybe_zero br i1 %t3, label %bb, label %return +; CHECK: bb3: +; CHECK: br i1 {{.*}}, label %bb, label %return return: %x.lcssa = phi i64 [ 12, %bb ], [ 10, %bb2 ], [ 10, %bb3 ] ret i64 %x.lcssa +; CHECK: return: +; CHECK-NEXT: %[[X:.*]] = phi i64 [ 12, %bb ], [ 10, %bb2 ], [ 10, %bb3 ] +; CHECK-NEXT: ret i64 %[[X]] } diff --git a/test/Transforms/LoopDistribute/diagnostics-with-hotness-lazy-BFI.ll b/test/Transforms/LoopDistribute/diagnostics-with-hotness-lazy-BFI.ll deleted file mode 100644 index bb6459acc062..000000000000 --- a/test/Transforms/LoopDistribute/diagnostics-with-hotness-lazy-BFI.ll +++ /dev/null @@ -1,88 +0,0 @@ -; Check that BFI is not computed when -pass-remarks-with-hotness is off - -; RUN: opt -loop-distribute -enable-loop-distribute -S -pass-remarks-missed=loop-distribute \ -; RUN: -debug-only=block-freq,branch-prob -pass-remarks-with-hotness \ -; RUN: < %s 2>&1 | FileCheck %s --check-prefix=HOTNESS -; RUN: opt -loop-distribute -enable-loop-distribute -S -pass-remarks-missed=loop-distribute \ -; RUN: -debug-only=block-freq,branch-prob \ -; RUN: < %s 2>&1 | FileCheck %s --check-prefix=NO_HOTNESS - -; RUN: opt -passes='require<aa>,loop-distribute' -S -pass-remarks-missed=loop-distribute \ -; RUN: -debug-only=block-freq,branch-prob -pass-remarks-with-hotness \ -; RUN: < %s 2>&1 | FileCheck %s --check-prefix=HOTNESS -; RUN: opt -passes='require<aa>,loop-distribute' -S -pass-remarks-missed=loop-distribute \ -; RUN: -debug-only=block-freq,branch-prob \ -; RUN: < %s 2>&1 | FileCheck %s --check-prefix=NO_HOTNESS - -; REQUIRES: asserts - -; HOTNESS: Branch Probability Info : forced -; HOTNESS: block-frequency: forced -; NO_HOTNESS-NOT: Branch Probability Info : forced -; NO_HOTNESS-NOT: block-frequency: forced - -; This is the input program: -; -; 1 void forced (char *A, char *B, char *C, int N) { -; 2 #pragma clang loop distribute(enable) -; 3 for(int i = 0; i < N; i++) { -; 4 A[i] = B[i] * C[i]; -; 5 } -; 6 } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.11.0" - - -define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 !prof !22 { -entry: - %cmp12 = icmp sgt i32 %N, 0, !dbg !9 - br i1 %cmp12, label %ph, label %for.cond.cleanup, !dbg !10, !prof !23 - -ph: - br label %for.body - -for.body: - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ] - %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !12 - %0 = load i8, i8* %arrayidx, align 1, !dbg !12, !tbaa !13 - %arrayidx2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !16 - %1 = load i8, i8* %arrayidx2, align 1, !dbg !16, !tbaa !13 - %mul = mul i8 %1, %0, !dbg !17 - %arrayidx6 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !18 - store i8 %mul, i8* %arrayidx6, align 1, !dbg !19, !tbaa !13 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10 - %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !10 - br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !10, !llvm.loop !20, !prof !24 - -for.cond.cleanup: - ret void, !dbg !11 -} - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4} - -!0 = distinct 
!DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 267633) (llvm/trunk 267675)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2) -!1 = !DIFile(filename: "/tmp/t.c", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 2} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!7 = distinct !DISubprogram(name: "forced", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 3, column: 20, scope: !7) -!10 = !DILocation(line: 3, column: 3, scope: !7) -!11 = !DILocation(line: 6, column: 1, scope: !7) -!12 = !DILocation(line: 4, column: 12, scope: !7) -!13 = !{!14, !14, i64 0} -!14 = !{!"omnipotent char", !15, i64 0} -!15 = !{!"Simple C/C++ TBAA"} -!16 = !DILocation(line: 4, column: 19, scope: !7) -!17 = !DILocation(line: 4, column: 17, scope: !7) -!18 = !DILocation(line: 4, column: 5, scope: !7) -!19 = !DILocation(line: 4, column: 10, scope: !7) -!20 = distinct !{!20, !21} -!21 = !{!"llvm.loop.distribute.enable", i1 true} -!22 = !{!"function_entry_count", i64 3} -!23 = !{!"branch_weights", i32 99, i32 1} -!24 = !{!"branch_weights", i32 1, i32 99} diff --git a/test/Transforms/LoopIdiom/unroll.ll b/test/Transforms/LoopIdiom/unroll.ll index 0cdfda254d78..5981c3e4492f 100644 --- a/test/Transforms/LoopIdiom/unroll.ll +++ b/test/Transforms/LoopIdiom/unroll.ll @@ -1,7 +1,7 @@ ; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -; CHECK @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 +; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 target triple = "x86_64-apple-darwin10.0.0" diff --git a/test/Transforms/LoopLoadElim/backward.ll b/test/Transforms/LoopLoadElim/backward.ll index 7c750a51a2a3..c0cec75bdd37 100644 --- a/test/Transforms/LoopLoadElim/backward.ll +++ b/test/Transforms/LoopLoadElim/backward.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-load-elim -S < %s | FileCheck %s +; RUN: opt -passes=loop-load-elim -S < %s | FileCheck %s ; Simple st->ld forwarding derived from a lexical backward dep. ; diff --git a/test/Transforms/LoopLoadElim/forward.ll b/test/Transforms/LoopLoadElim/forward.ll index 9a0e03a317c8..0b270cab3edc 100644 --- a/test/Transforms/LoopLoadElim/forward.ll +++ b/test/Transforms/LoopLoadElim/forward.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-load-elim -S < %s | FileCheck %s +; RUN: opt -passes=loop-load-elim -S < %s | FileCheck %s ; Simple st->ld forwarding derived from a lexical forward dep. ; diff --git a/test/Transforms/LoopPredication/basic.ll b/test/Transforms/LoopPredication/basic.ll new file mode 100644 index 000000000000..6ce07819cb03 --- /dev/null +++ b/test/Transforms/LoopPredication/basic.ll @@ -0,0 +1,571 @@ +; RUN: opt -S -loop-predication < %s 2>&1 | FileCheck %s +; RUN: opt -S -passes='require<scalar-evolution>,loop(loop-predication)' < %s 2>&1 | FileCheck %s + +declare void @llvm.experimental.guard(i1, ...) 
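; [Editor's sketch, not part of the checked-in diff] The pattern these
; loop-predication tests exercise: a guard whose condition re-checks the
; induction variable on every iteration, e.g. for %i ranging over [0, %n)
;
;   %within.bounds = icmp ult i32 %i, %length
;   call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
;
; is replaced by a loop-invariant condition computed once in the preheader,
; checking only the largest index the loop will ever use (value names here are
; illustrative, mirroring the CHECK patterns below):
;
;   loop.preheader:
;     %max.index = add i32 %n, -1
;     %wide.cond = icmp ult i32 %max.index, %length
;     br label %loop
;   loop:
;     call void (i1, ...) @llvm.experimental.guard(i1 %wide.cond, i32 9) [ "deopt"() ]
;     ...
;
; If the widened condition holds, every per-iteration check in [0, %n) holds,
; so the guard can deoptimize on the hoisted condition instead.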
+ +define i32 @unsigned_loop_0_to_n_ult_check(i32* %array, i32 %length, i32 %n) { +; CHECK-LABEL: @unsigned_loop_0_to_n_ult_check +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1 +; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[max_index]], %length +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + %within.bounds = icmp ult i32 %i, %length + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @unsigned_loop_0_to_n_ugt_check(i32* %array, i32 %length, i32 %n) { +; CHECK-LABEL: @unsigned_loop_0_to_n_ugt_check +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1 +; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[max_index]], %length +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + %within.bounds = icmp ugt i32 %length, %i + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + + +define i32 @two_range_checks(i32* %array.1, i32 %length.1, + i32* %array.2, i32 %length.2, i32 %n) { +; CHECK-LABEL: @two_range_checks +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1 +; CHECK-NEXT: [[wide_cond_1:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2}} +; CHECK-NEXT: [[wide_cond_2:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2}} +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: [[wide_cond:[^ ]+]] = and i1 [[wide_cond_1]], [[wide_cond_2]] +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + %within.bounds.1 = icmp ult i32 %i, %length.1 + %within.bounds.2 = icmp ult i32 %i, %length.2 + %within.bounds = and i1 %within.bounds.1, %within.bounds.2 + call void (i1, ...) 
@llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.1.i.ptr = getelementptr inbounds i32, i32* %array.1, i64 %i.i64 + %array.1.i = load i32, i32* %array.1.i.ptr, align 4 + %loop.acc.1 = add i32 %loop.acc, %array.1.i + + %array.2.i.ptr = getelementptr inbounds i32, i32* %array.2, i64 %i.i64 + %array.2.i = load i32, i32* %array.2.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc.1, %array.2.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @three_range_checks(i32* %array.1, i32 %length.1, + i32* %array.2, i32 %length.2, + i32* %array.3, i32 %length.3, i32 %n) { +; CHECK-LABEL: @three_range_checks +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1 +; CHECK-NEXT: [[wide_cond_1:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2|3}} +; CHECK-NEXT: [[wide_cond_2:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2|3}} +; CHECK-NEXT: [[wide_cond_3:[^ ]+]] = icmp ult i32 [[max_index]], %length.{{1|2|3}} +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: [[wide_cond_and:[^ ]+]] = and i1 [[wide_cond_1]], [[wide_cond_2]] +; CHECK-NEXT: [[wide_cond:[^ ]+]] = and i1 [[wide_cond_and]], [[wide_cond_3]] +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + %within.bounds.1 = icmp ult i32 %i, %length.1 + %within.bounds.2 = icmp ult i32 %i, %length.2 + %within.bounds.3 = icmp ult i32 %i, %length.3 + %within.bounds.1.and.2 = and i1 %within.bounds.1, %within.bounds.2 + %within.bounds = and i1 %within.bounds.1.and.2, %within.bounds.3 + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.1.i.ptr = getelementptr inbounds i32, i32* %array.1, i64 %i.i64 + %array.1.i = load i32, i32* %array.1.i.ptr, align 4 + %loop.acc.1 = add i32 %loop.acc, %array.1.i + + %array.2.i.ptr = getelementptr inbounds i32, i32* %array.2, i64 %i.i64 + %array.2.i = load i32, i32* %array.2.i.ptr, align 4 + %loop.acc.2 = add i32 %loop.acc.1, %array.2.i + + %array.3.i.ptr = getelementptr inbounds i32, i32* %array.3, i64 %i.i64 + %array.3.i = load i32, i32* %array.3.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc.2, %array.3.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @three_guards(i32* %array.1, i32 %length.1, + i32* %array.2, i32 %length.2, + i32* %array.3, i32 %length.3, i32 %n) { +; CHECK-LABEL: @three_guards +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1 +; CHECK-NEXT: [[wide_cond_1:[^ ]+]] = icmp ult i32 [[max_index]], %length.1 +; CHECK-NEXT: [[wide_cond_2:[^ ]+]] = icmp ult i32 [[max_index]], %length.2 +; CHECK-NEXT: [[wide_cond_3:[^ ]+]] = icmp ult i32 [[max_index]], %length.3 +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 [[wide_cond_1]], i32 9) [ "deopt"() ] +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond_2]], i32 9) [ "deopt"() ] +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond_3]], i32 9) [ "deopt"() ] + + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + + %within.bounds.1 = icmp ult i32 %i, %length.1 + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds.1, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.1.i.ptr = getelementptr inbounds i32, i32* %array.1, i64 %i.i64 + %array.1.i = load i32, i32* %array.1.i.ptr, align 4 + %loop.acc.1 = add i32 %loop.acc, %array.1.i + + %within.bounds.2 = icmp ult i32 %i, %length.2 + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds.2, i32 9) [ "deopt"() ] + + %array.2.i.ptr = getelementptr inbounds i32, i32* %array.2, i64 %i.i64 + %array.2.i = load i32, i32* %array.2.i.ptr, align 4 + %loop.acc.2 = add i32 %loop.acc.1, %array.2.i + + %within.bounds.3 = icmp ult i32 %i, %length.3 + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds.3, i32 9) [ "deopt"() ] + + %array.3.i.ptr = getelementptr inbounds i32, i32* %array.3, i64 %i.i64 + %array.3.i = load i32, i32* %array.3.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc.2, %array.3.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @signed_loop_start_to_n_sge_0_check(i32* %array, i32 %length, i32 %start, i32 %n) { +; CHECK-LABEL: @signed_loop_start_to_n_sge_0_check +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp sge i32 %start, 0 +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ %start, %loop.preheader ] + %within.bounds = icmp sge i32 %i, 0 + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nsw i32 %i, 1 + %continue = icmp slt i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @signed_loop_start_to_n_upper_slt_length_check(i32* %array, i32 %length, i32 %start, i32 %n) { +; CHECK-LABEL: @signed_loop_start_to_n_upper_slt_length_check +entry: + %tmp5 = icmp sle i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[start_1:[^ ]+]] = add i32 %start, 1 +; CHECK-NEXT: [[n_sgt_start_1:[^ ]+]] = icmp sgt i32 %n, [[start_1]] +; CHECK-NEXT: [[smax:[^ ]+]] = select i1 [[n_sgt_start_1]], i32 %n, i32 [[start_1]] +; CHECK-NEXT: [[max_index:[^ ]+]] = add i32 [[smax]], -1 +; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp slt i32 [[max_index]], %length +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ %start, %loop.preheader ] + %within.bounds = icmp slt i32 %i, %length + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nsw i32 %i, 1 + %continue = icmp slt i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @signed_loop_start_to_n_both_checks(i32* %array, i32 %length, i32 %start, i32 %n) { +; CHECK-LABEL: @signed_loop_start_to_n_both_checks +entry: + %tmp5 = icmp sle i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[lower_check:[^ ]+]] = icmp sge i32 %start, 0 +; CHECK-NEXT: [[start_1:[^ ]+]] = add i32 %start, 1 +; CHECK-NEXT: [[n_sgt_start_1:[^ ]+]] = icmp sgt i32 %n, [[start_1]] +; CHECK-NEXT: [[smax:[^ ]+]] = select i1 [[n_sgt_start_1]], i32 %n, i32 [[start_1]] +; CHECK-NEXT: [[max_index:[^ ]+]] = add i32 [[smax]], -1 +; CHECK-NEXT: [[upper_check:[^ ]+]] = icmp slt i32 [[max_index]], %length +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: [[wide_cond:[^ ]+]] = and i1 [[lower_check]], [[upper_check]] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ %start, %loop.preheader ] + %within.bounds.1 = icmp slt i32 %i, %length + %within.bounds.2 = icmp sge i32 %i, 0 + %within.bounds = and i1 %within.bounds.1, %within.bounds.2 + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nsw i32 %i, 1 + %continue = icmp slt i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @unsigned_loop_0_to_n_unrelated_condition(i32* %array, i32 %length, i32 %n, i32 %x) { +; CHECK-LABEL: @unsigned_loop_0_to_n_unrelated_condition +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1 +; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[max_index]], %length +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: %unrelated.cond = icmp ult i32 %x, %length +; CHECK: [[guard_cond:[^ ]+]] = and i1 %unrelated.cond, [[wide_cond]] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[guard_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + %within.bounds = icmp ult i32 %i, %length + %unrelated.cond = icmp ult i32 %x, %length + %guard.cond = and i1 %within.bounds, %unrelated.cond + call void (i1, ...) 
@llvm.experimental.guard(i1 %guard.cond, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +; Don't change the guard condition if there were no widened subconditions +define i32 @test_no_widened_conditions(i32* %array, i32 %length, i32 %n, i32 %x1, i32 %x2, i32 %x3) { +; CHECK-LABEL: @test_no_widened_conditions +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: %unrelated.cond.1 = icmp eq i32 %x1, %i +; CHECK-NEXT: %unrelated.cond.2 = icmp eq i32 %x2, %i +; CHECK-NEXT: %unrelated.cond.3 = icmp eq i32 %x3, %i +; CHECK-NEXT: %unrelated.cond.and.1 = and i1 %unrelated.cond.1, %unrelated.cond.2 +; CHECK-NEXT: %guard.cond = and i1 %unrelated.cond.and.1, %unrelated.cond.3 +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %guard.cond, i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + %unrelated.cond.1 = icmp eq i32 %x1, %i + %unrelated.cond.2 = icmp eq i32 %x2, %i + %unrelated.cond.3 = icmp eq i32 %x3, %i + %unrelated.cond.and.1 = and i1 %unrelated.cond.1, %unrelated.cond.2 + %guard.cond = and i1 %unrelated.cond.and.1, %unrelated.cond.3 + + call void (i1, ...) @llvm.experimental.guard(i1 %guard.cond, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @signed_loop_start_to_n_loop_variant_bound(i32* %array, i32 %x, i32 %start, i32 %n) { +; CHECK-LABEL: @signed_loop_start_to_n_loop_variant_bound +entry: + %tmp5 = icmp sle i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: %bound = add i32 %i, %x +; CHECK-NEXT: %within.bounds = icmp slt i32 %i, %bound +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ %start, %loop.preheader ] + %bound = add i32 %i, %x + %within.bounds = icmp slt i32 %i, %bound + call void (i1, ...) 
@llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nsw i32 %i, 1 + %continue = icmp slt i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @signed_loop_start_to_n_non_monotonic_predicate(i32* %array, i32 %x, i32 %start, i32 %n) { +; CHECK-LABEL: @signed_loop_start_to_n_non_monotonic_predicate +entry: + %tmp5 = icmp sle i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: %guard.cond = icmp eq i32 %i, %x +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %guard.cond, i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ %start, %loop.preheader ] + %guard.cond = icmp eq i32 %i, %x + call void (i1, ...) @llvm.experimental.guard(i1 %guard.cond, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nsw i32 %i, 1 + %continue = icmp slt i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @unsigned_loop_0_to_n_hoist_length(i32* %array, i16 %length.i16, i32 %n) { +; CHECK-LABEL: @unsigned_loop_0_to_n_hoist_length +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1 +; CHECK-NEXT: [[length:[^ ]+]] = zext i16 %length.i16 to i32 +; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[max_index]], [[length]] +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + %length = zext i16 %length.i16 to i32 + %within.bounds = icmp ult i32 %i, %length + call void (i1, ...) 
@llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} + +define i32 @unsigned_loop_0_to_n_cant_hoist_length(i32* %array, i32 %length, i32 %divider, i32 %n) { +; CHECK-LABEL: @unsigned_loop_0_to_n_cant_hoist_length +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK-NEXT: %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] +; CHECK-NEXT: %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] +; CHECK-NEXT: %length.udiv = udiv i32 %length, %divider +; CHECK-NEXT: %within.bounds = icmp ult i32 %i, %length.udiv +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + %length.udiv = udiv i32 %length, %divider + %within.bounds = icmp ult i32 %i, %length.udiv + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +} diff --git a/test/Transforms/LoopPredication/nested.ll b/test/Transforms/LoopPredication/nested.ll new file mode 100644 index 000000000000..6b40cde3e575 --- /dev/null +++ b/test/Transforms/LoopPredication/nested.ll @@ -0,0 +1,160 @@ +; RUN: opt -S -loop-predication < %s 2>&1 | FileCheck %s +; RUN: opt -S -passes='require<scalar-evolution>,loop(loop-predication)' < %s 2>&1 | FileCheck %s + +declare void @llvm.experimental.guard(i1, ...) + +define i32 @signed_loop_0_to_n_nested_0_to_l_inner_index_check(i32* %array, i32 %length, i32 %n, i32 %l) { +; CHECK-LABEL: @signed_loop_0_to_n_nested_0_to_l_inner_index_check +entry: + %tmp5 = icmp sle i32 %n, 0 + br i1 %tmp5, label %exit, label %outer.loop.preheader + +outer.loop.preheader: +; CHECK: outer.loop.preheader: +; CHECK: [[iteration_count:[^ ]+]] = add i32 %l, -1 + br label %outer.loop + +outer.loop: + %outer.loop.acc = phi i32 [ %outer.loop.acc.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ] + %i = phi i32 [ %i.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ] + %tmp6 = icmp sle i32 %l, 0 + br i1 %tmp6, label %outer.loop.inc, label %inner.loop.preheader + +inner.loop.preheader: +; CHECK: inner.loop.preheader: +; CHECK: [[wide_cond:[^ ]+]] = icmp slt i32 [[iteration_count]], %length + br label %inner.loop + +inner.loop: +; CHECK: inner.loop: +; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ] + %j = phi i32 [ %j.next, %inner.loop ], [ 0, %inner.loop.preheader ] + + %within.bounds = icmp slt i32 %j, %length + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %j.i64 = zext i32 %j to i64 + %array.j.ptr = getelementptr inbounds i32, i32* %array, i64 %j.i64 + %array.j = load i32, i32* %array.j.ptr, align 4 + %inner.loop.acc.next = add i32 %inner.loop.acc, %array.j + + %j.next = add nsw i32 %j, 1 + %inner.continue = icmp slt i32 %j.next, %l + br i1 %inner.continue, label %inner.loop, label %outer.loop.inc + +outer.loop.inc: + %outer.loop.acc.next = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %outer.loop ] + %i.next = add nsw i32 %i, 1 + %outer.continue = icmp slt i32 %i.next, %n + br i1 %outer.continue, label %outer.loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %outer.loop.acc.next, %outer.loop.inc ] + ret i32 %result +} + +define i32 @signed_loop_0_to_n_nested_0_to_l_outer_index_check(i32* %array, i32 %length, i32 %n, i32 %l) { +; CHECK-LABEL: @signed_loop_0_to_n_nested_0_to_l_outer_index_check +entry: + %tmp5 = icmp sle i32 %n, 0 + br i1 %tmp5, label %exit, label %outer.loop.preheader + +outer.loop.preheader: +; CHECK: outer.loop.preheader: +; CHECK: [[iteration_count:[^ ]+]] = add i32 %n, -1 +; CHECK: [[wide_cond:[^ ]+]] = icmp slt i32 [[iteration_count]], %length + br label %outer.loop + +outer.loop: + %outer.loop.acc = phi i32 [ %outer.loop.acc.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ] + %i = phi i32 [ %i.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ] + %tmp6 = icmp sle i32 %l, 0 + br i1 %tmp6, label %outer.loop.inc, label %inner.loop.preheader + +inner.loop.preheader: + br label %inner.loop + +inner.loop: +; CHECK: inner.loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + + %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ] + %j = phi i32 [ %j.next, %inner.loop ], [ 0, %inner.loop.preheader ] + + %within.bounds = icmp slt i32 %i, %length + call void (i1, ...) 
@llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %inner.loop.acc.next = add i32 %inner.loop.acc, %array.i + + %j.next = add nsw i32 %j, 1 + %inner.continue = icmp slt i32 %j.next, %l + br i1 %inner.continue, label %inner.loop, label %outer.loop.inc + +outer.loop.inc: + %outer.loop.acc.next = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %outer.loop ] + %i.next = add nsw i32 %i, 1 + %outer.continue = icmp slt i32 %i.next, %n + br i1 %outer.continue, label %outer.loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %outer.loop.acc.next, %outer.loop.inc ] + ret i32 %result +} + +define i32 @signed_loop_0_to_n_nested_i_to_l_inner_index_check(i32* %array, i32 %length, i32 %n, i32 %l) { +; CHECK-LABEL: @signed_loop_0_to_n_nested_i_to_l_inner_index_check +entry: + %tmp5 = icmp sle i32 %n, 0 + br i1 %tmp5, label %exit, label %outer.loop.preheader + +outer.loop.preheader: + br label %outer.loop + +outer.loop: +; CHECK: outer.loop: +; CHECK: [[i_1:[^ ]+]] = add i32 %i, 1 +; CHECK-NEXT: [[l_sgt_i_1:[^ ]+]] = icmp sgt i32 %l, [[i_1]] +; CHECK-NEXT: [[smax:[^ ]+]] = select i1 [[l_sgt_i_1]], i32 %l, i32 [[i_1]] +; CHECK-NEXT: [[max_j:[^ ]+]] = add i32 [[smax]], -1 + %outer.loop.acc = phi i32 [ %outer.loop.acc.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ] + %i = phi i32 [ %i.next, %outer.loop.inc ], [ 0, %outer.loop.preheader ] + %tmp6 = icmp sle i32 %l, 0 + br i1 %tmp6, label %outer.loop.inc, label %inner.loop.preheader + +inner.loop.preheader: +; CHECK: inner.loop.preheader: +; CHECK: [[wide_cond:[^ ]+]] = icmp slt i32 [[max_j]], %length + br label %inner.loop + +inner.loop: +; CHECK: inner.loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ] + %inner.loop.acc = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %inner.loop.preheader ] + %j = phi i32 [ %j.next, %inner.loop ], [ %i, %inner.loop.preheader ] + + %within.bounds = icmp slt i32 %j, %length + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + + %j.i64 = zext i32 %j to i64 + %array.j.ptr = getelementptr inbounds i32, i32* %array, i64 %j.i64 + %array.j = load i32, i32* %array.j.ptr, align 4 + %inner.loop.acc.next = add i32 %inner.loop.acc, %array.j + + %j.next = add nsw i32 %j, 1 + %inner.continue = icmp slt i32 %j.next, %l + br i1 %inner.continue, label %inner.loop, label %outer.loop.inc + +outer.loop.inc: + %outer.loop.acc.next = phi i32 [ %inner.loop.acc.next, %inner.loop ], [ %outer.loop.acc, %outer.loop ] + %i.next = add nsw i32 %i, 1 + %outer.continue = icmp slt i32 %i.next, %n + br i1 %outer.continue, label %outer.loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %outer.loop.acc.next, %outer.loop.inc ] + ret i32 %result +}
\ No newline at end of file diff --git a/test/Transforms/LoopPredication/visited.ll b/test/Transforms/LoopPredication/visited.ll new file mode 100644 index 000000000000..e9aae77f8e6f --- /dev/null +++ b/test/Transforms/LoopPredication/visited.ll @@ -0,0 +1,140 @@ +; RUN: opt -S -loop-predication < %s 2>&1 | FileCheck %s +; RUN: opt -S -passes='require<scalar-evolution>,loop(loop-predication)' < %s 2>&1 | FileCheck %s + +declare void @llvm.experimental.guard(i1, ...) + +define i32 @test_visited(i32* %array, i32 %length, i32 %n, i32 %x) { +; CHECK-LABEL: @test_visited +entry: + %tmp5 = icmp eq i32 %n, 0 + br i1 %tmp5, label %exit, label %loop.preheader + +loop.preheader: +; CHECK: loop.preheader: +; CHECK: [[iteration_count:[^ ]+]] = add i32 %n, -1 +; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[iteration_count]], %length +; CHECK-NEXT: br label %loop + br label %loop + +loop: +; CHECK: loop: +; CHECK: %unrelated.cond = icmp eq i32 %x, %i +; CHECK: [[guard_cond:[^ ]+]] = and i1 %unrelated.cond, [[wide_cond]] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[guard_cond]], i32 9) [ "deopt"() ] + %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ] + %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ] + %within.bounds = icmp ult i32 %i, %length + %unrelated.cond = icmp eq i32 %x, %i + %guard.cond.2 = and i1 %within.bounds, %unrelated.cond + %guard.cond.3 = and i1 %guard.cond.2, %unrelated.cond + %guard.cond.4 = and i1 %guard.cond.3, %guard.cond.2 + %guard.cond.5 = and i1 %guard.cond.4, %guard.cond.3 + %guard.cond.6 = and i1 %guard.cond.5, %guard.cond.4 + %guard.cond.7 = and i1 %guard.cond.6, %guard.cond.5 + %guard.cond.8 = and i1 %guard.cond.7, %guard.cond.6 + %guard.cond.9 = and i1 %guard.cond.8, %guard.cond.7 + %guard.cond.10 = and i1 %guard.cond.9, %guard.cond.8 + %guard.cond.11 = and i1 %guard.cond.10, %guard.cond.9 + %guard.cond.12 = and i1 %guard.cond.11, %guard.cond.10 + %guard.cond.13 = and i1 %guard.cond.12, %guard.cond.11 + %guard.cond.14 = and i1 %guard.cond.13, %guard.cond.12 + %guard.cond.15 = and i1 %guard.cond.14, %guard.cond.13 + %guard.cond.16 = and i1 %guard.cond.15, %guard.cond.14 + %guard.cond.17 = and i1 %guard.cond.16, %guard.cond.15 + %guard.cond.18 = and i1 %guard.cond.17, %guard.cond.16 + %guard.cond.19 = and i1 %guard.cond.18, %guard.cond.17 + %guard.cond.20 = and i1 %guard.cond.19, %guard.cond.18 + %guard.cond.21 = and i1 %guard.cond.20, %guard.cond.19 + %guard.cond.22 = and i1 %guard.cond.21, %guard.cond.20 + %guard.cond.23 = and i1 %guard.cond.22, %guard.cond.21 + %guard.cond.24 = and i1 %guard.cond.23, %guard.cond.22 + %guard.cond.25 = and i1 %guard.cond.24, %guard.cond.23 + %guard.cond.26 = and i1 %guard.cond.25, %guard.cond.24 + %guard.cond.27 = and i1 %guard.cond.26, %guard.cond.25 + %guard.cond.28 = and i1 %guard.cond.27, %guard.cond.26 + %guard.cond.29 = and i1 %guard.cond.28, %guard.cond.27 + %guard.cond.30 = and i1 %guard.cond.29, %guard.cond.28 + %guard.cond.31 = and i1 %guard.cond.30, %guard.cond.29 + %guard.cond.32 = and i1 %guard.cond.31, %guard.cond.30 + %guard.cond.33 = and i1 %guard.cond.32, %guard.cond.31 + %guard.cond.34 = and i1 %guard.cond.33, %guard.cond.32 + %guard.cond.35 = and i1 %guard.cond.34, %guard.cond.33 + %guard.cond.36 = and i1 %guard.cond.35, %guard.cond.34 + %guard.cond.37 = and i1 %guard.cond.36, %guard.cond.35 + %guard.cond.38 = and i1 %guard.cond.37, %guard.cond.36 + %guard.cond.39 = and i1 %guard.cond.38, %guard.cond.37 + %guard.cond.40 = and i1 %guard.cond.39, %guard.cond.38 + 
%guard.cond.41 = and i1 %guard.cond.40, %guard.cond.39 + %guard.cond.42 = and i1 %guard.cond.41, %guard.cond.40 + %guard.cond.43 = and i1 %guard.cond.42, %guard.cond.41 + %guard.cond.44 = and i1 %guard.cond.43, %guard.cond.42 + %guard.cond.45 = and i1 %guard.cond.44, %guard.cond.43 + %guard.cond.46 = and i1 %guard.cond.45, %guard.cond.44 + %guard.cond.47 = and i1 %guard.cond.46, %guard.cond.45 + %guard.cond.48 = and i1 %guard.cond.47, %guard.cond.46 + %guard.cond.49 = and i1 %guard.cond.48, %guard.cond.47 + %guard.cond.50 = and i1 %guard.cond.49, %guard.cond.48 + %guard.cond.51 = and i1 %guard.cond.50, %guard.cond.49 + %guard.cond.52 = and i1 %guard.cond.51, %guard.cond.50 + %guard.cond.53 = and i1 %guard.cond.52, %guard.cond.51 + %guard.cond.54 = and i1 %guard.cond.53, %guard.cond.52 + %guard.cond.55 = and i1 %guard.cond.54, %guard.cond.53 + %guard.cond.56 = and i1 %guard.cond.55, %guard.cond.54 + %guard.cond.57 = and i1 %guard.cond.56, %guard.cond.55 + %guard.cond.58 = and i1 %guard.cond.57, %guard.cond.56 + %guard.cond.59 = and i1 %guard.cond.58, %guard.cond.57 + %guard.cond.60 = and i1 %guard.cond.59, %guard.cond.58 + %guard.cond.61 = and i1 %guard.cond.60, %guard.cond.59 + %guard.cond.62 = and i1 %guard.cond.61, %guard.cond.60 + %guard.cond.63 = and i1 %guard.cond.62, %guard.cond.61 + %guard.cond.64 = and i1 %guard.cond.63, %guard.cond.62 + %guard.cond.65 = and i1 %guard.cond.64, %guard.cond.63 + %guard.cond.66 = and i1 %guard.cond.65, %guard.cond.64 + %guard.cond.67 = and i1 %guard.cond.66, %guard.cond.65 + %guard.cond.68 = and i1 %guard.cond.67, %guard.cond.66 + %guard.cond.69 = and i1 %guard.cond.68, %guard.cond.67 + %guard.cond.70 = and i1 %guard.cond.69, %guard.cond.68 + %guard.cond.71 = and i1 %guard.cond.70, %guard.cond.69 + %guard.cond.72 = and i1 %guard.cond.71, %guard.cond.70 + %guard.cond.73 = and i1 %guard.cond.72, %guard.cond.71 + %guard.cond.74 = and i1 %guard.cond.73, %guard.cond.72 + %guard.cond.75 = and i1 %guard.cond.74, %guard.cond.73 + %guard.cond.76 = and i1 %guard.cond.75, %guard.cond.74 + %guard.cond.77 = and i1 %guard.cond.76, %guard.cond.75 + %guard.cond.78 = and i1 %guard.cond.77, %guard.cond.76 + %guard.cond.79 = and i1 %guard.cond.78, %guard.cond.77 + %guard.cond.80 = and i1 %guard.cond.79, %guard.cond.78 + %guard.cond.81 = and i1 %guard.cond.80, %guard.cond.79 + %guard.cond.82 = and i1 %guard.cond.81, %guard.cond.80 + %guard.cond.83 = and i1 %guard.cond.82, %guard.cond.81 + %guard.cond.84 = and i1 %guard.cond.83, %guard.cond.82 + %guard.cond.85 = and i1 %guard.cond.84, %guard.cond.83 + %guard.cond.86 = and i1 %guard.cond.85, %guard.cond.84 + %guard.cond.87 = and i1 %guard.cond.86, %guard.cond.85 + %guard.cond.88 = and i1 %guard.cond.87, %guard.cond.86 + %guard.cond.89 = and i1 %guard.cond.88, %guard.cond.87 + %guard.cond.90 = and i1 %guard.cond.89, %guard.cond.88 + %guard.cond.91 = and i1 %guard.cond.90, %guard.cond.89 + %guard.cond.92 = and i1 %guard.cond.91, %guard.cond.90 + %guard.cond.93 = and i1 %guard.cond.92, %guard.cond.91 + %guard.cond.94 = and i1 %guard.cond.93, %guard.cond.92 + %guard.cond.95 = and i1 %guard.cond.94, %guard.cond.93 + %guard.cond.96 = and i1 %guard.cond.95, %guard.cond.94 + %guard.cond.97 = and i1 %guard.cond.96, %guard.cond.95 + %guard.cond.98 = and i1 %guard.cond.97, %guard.cond.96 + %guard.cond.99 = and i1 %guard.cond.98, %guard.cond.97 + call void (i1, ...) 
@llvm.experimental.guard(i1 %guard.cond.99, i32 9) [ "deopt"() ] + + %i.i64 = zext i32 %i to i64 + %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64 + %array.i = load i32, i32* %array.i.ptr, align 4 + %loop.acc.next = add i32 %loop.acc, %array.i + + %i.next = add nuw i32 %i, 1 + %continue = icmp ult i32 %i.next, %n + br i1 %continue, label %loop, label %exit + +exit: + %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ] + ret i32 %result +}
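+ +; A sketch of the shape stressed above (an illustrative note with hypothetical names %c0..%c4, not part of the checked output): every value in the i1 DAG is reused by the two following 'and' nodes, +; %c2 = and i1 %c1, %c0 +; %c3 = and i1 %c2, %c1 +; %c4 = and i1 %c3, %c2 +; so a recursive walk of the guard condition that does not remember already-visited values re-expands each shared subtree and becomes exponential, while a visited set keeps the traversal linear in the number of 'and' instructions.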
\ No newline at end of file diff --git a/test/Transforms/LoopRotate/phi-dbgvalue.ll b/test/Transforms/LoopRotate/phi-dbgvalue.ll new file mode 100644 index 000000000000..aa8ca2f627bd --- /dev/null +++ b/test/Transforms/LoopRotate/phi-dbgvalue.ll @@ -0,0 +1,79 @@ +; RUN: opt -S -loop-rotate < %s | FileCheck %s + +;CHECK-LABEL: func +;CHECK-LABEL: entry +;CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 %a +;CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !13, metadata !11), !dbg !15 +;CHECK-LABEL: for.body: +;CHECK-NEXT: [[I:%.*]] = phi i32 [ 1, %entry ], [ %inc, %for.body ] +;CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[I]], i64 0, metadata !13, metadata !11), !dbg !15 + +; Function Attrs: noinline nounwind +define void @func(i32 %a) local_unnamed_addr #0 !dbg !6 { +entry: + tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !10, metadata !11), !dbg !12 + tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !13, metadata !11), !dbg !15 + br label %for.cond, !dbg !16 + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 1, %entry ], [ %inc, %for.body ] + tail call void @llvm.dbg.value(metadata i32 %i.0, i64 0, metadata !13, metadata !11), !dbg !15 + %cmp = icmp slt i32 %i.0, 10, !dbg !17 + br i1 %cmp, label %for.body, label %for.end, !dbg !20 + +for.body: ; preds = %for.cond + %add = add nsw i32 %i.0, %a, !dbg !22 + %call = tail call i32 @func2(i32 %i.0, i32 %add) #3, !dbg !24 + %inc = add nsw i32 %i.0, 1, !dbg !25 + tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !13, metadata !11), !dbg !15 + br label %for.cond, !dbg !27, !llvm.loop !28 + +for.end: ; preds = %for.cond + ret void, !dbg !31 +} + +declare i32 @func2(i32, i32) local_unnamed_addr + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 + +attributes #0 = { noinline nounwind } +attributes #2 = { nounwind readnone } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 0f3ed908c1f13f83da4b240f7595eb8d05e0a754) (http://llvm.org/git/llvm.git 8e270f5a6b8ceb0f3ac3ef1ffb83c5e29b44ae68)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "debug-phi.c", directory: "/work/projects/src/tests/debug") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!"clang version 5.0.0 (http://llvm.org/git/clang.git 0f3ed908c1f13f83da4b240f7595eb8d05e0a754) (http://llvm.org/git/llvm.git 8e270f5a6b8ceb0f3ac3ef1ffb83c5e29b44ae68)"} +!6 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!7 = !DISubroutineType(types: !8) +!8 = !{null, !9} +!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!10 = !DILocalVariable(name: "a", arg: 1, scope: !6, file: !1, line: 2, type: !9) +!11 = !DIExpression() +!12 = !DILocation(line: 2, column: 15, scope: !6) +!13 = !DILocalVariable(name: "i", scope: !14, file: !1, line: 3, type: !9) +!14 = distinct !DILexicalBlock(scope: !6, file: !1, line: 3, column: 3) +!15 = !DILocation(line: 3, column: 11, scope: !14) +!16 = !DILocation(line: 3, column: 7, scope: !14) +!17 = !DILocation(line: 3, column: 20, scope: !18) +!18 = !DILexicalBlockFile(scope: 
!19, file: !1, discriminator: 1) +!19 = distinct !DILexicalBlock(scope: !14, file: !1, line: 3, column: 3) +!20 = !DILocation(line: 3, column: 3, scope: !21) +!21 = !DILexicalBlockFile(scope: !14, file: !1, discriminator: 1) +!22 = !DILocation(line: 4, column: 15, scope: !23) +!23 = distinct !DILexicalBlock(scope: !19, file: !1, line: 3, column: 31) +!24 = !DILocation(line: 4, column: 5, scope: !23) +!25 = !DILocation(line: 3, column: 27, scope: !26) +!26 = !DILexicalBlockFile(scope: !19, file: !1, discriminator: 2) +!27 = !DILocation(line: 3, column: 3, scope: !26) +!28 = distinct !{!28, !29, !30} +!29 = !DILocation(line: 3, column: 3, scope: !14) +!30 = !DILocation(line: 5, column: 3, scope: !14) +!31 = !DILocation(line: 6, column: 1, scope: !6) diff --git a/test/Transforms/LoopSimplify/dbg-loc.ll b/test/Transforms/LoopSimplify/dbg-loc.ll index 702a1ad16af6..98bfefd12238 100644 --- a/test/Transforms/LoopSimplify/dbg-loc.ll +++ b/test/Transforms/LoopSimplify/dbg-loc.ll @@ -23,6 +23,7 @@ entry: for.body: ; preds = %entry, %length.exit %begin.sink5 = phi %"Length"* [ %incdec.ptr, %length.exit ], [ %begin, %entry ] + tail call void @llvm.dbg.value(metadata %"Length"* %begin.sink5, i64 0, metadata !15, metadata !16), !dbg !17 %m_type.i.i.i = getelementptr inbounds %"Length", %"Length"* %begin.sink5, i64 0, i32 2, !dbg !9 %0 = load i8, i8* %m_type.i.i.i, align 1, !dbg !9 %cmp.i.i = icmp eq i8 %0, 9, !dbg !7 @@ -68,6 +69,9 @@ eh.resume: ; preds = %catch resume { i8*, i32 } undef, !dbg !13 } +; Function Attrs: nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + ; CHECK-DAG: [[PREHEADER_LOC]] = !DILocation(line: 73, column: 27, scope: !{{[0-9]+}}) ; CHECK-DAG: [[LOOPEXIT_LOC]] = !DILocation(line: 75, column: 9, scope: !{{[0-9]+}}) ; CHECK-DAG: [[LPAD_PREHEADER_LOC]] = !DILocation(line: 85, column: 1, scope: !{{[0-9]+}}) @@ -93,3 +97,6 @@ eh.resume: ; preds = %catch file: !5, isOptimized: true, flags: "-O2", splitDebugFilename: "abc.debug", emissionKind: 2) +!15 = !DILocalVariable(name: "begin", arg: 1, scope: !6, file: !5, line: 71) +!16 = !DIExpression() +!17 = !DILocation(line: 71, column: 32, scope: !6) diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll new file mode 100644 index 000000000000..054c61d18795 --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll @@ -0,0 +1,87 @@ +; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s + +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" + +; Make sure the pointer / address space of AtomicRMW is considered + +; OPT-LABEL: @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32( + +; OPT-NOT: getelementptr + +; OPT: .lr.ph: +; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ] +; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ] +; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ] +; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383 +; OPT: %tmp4 = atomicrmw add i32 addrspace(3)* %scevgep4, i32 undef seq_cst +; OPT: %tmp7 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 undef seq_cst +; OPT: %0 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 %tmp8 seq_cst +; OPT: br i1 %exitcond +define amdgpu_kernel void 
@test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +bb: + %tmp = icmp sgt i32 %n, 0 + br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge + +.lr.ph.preheader: ; preds = %bb + br label %.lr.ph + +._crit_edge.loopexit: ; preds = %.lr.ph + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.loopexit, %bb + ret void + +.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader + %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %tmp1 = add nuw nsw i32 %indvars.iv, 16383 + %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1 + %tmp4 = atomicrmw add i32 addrspace(3)* %tmp3, i32 undef seq_cst + %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv + %tmp7 = atomicrmw add i32 addrspace(3)* %tmp6, i32 undef seq_cst + %tmp8 = add nsw i32 %tmp7, %tmp4 + atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst + %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, %n + br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph +} + +; OPT-LABEL: test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32( +; OPT-NOT: getelementptr + +; OPT: .lr.ph: +; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ] +; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ] +; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ] +; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383 +; OPT: %tmp4 = cmpxchg i32 addrspace(3)* %scevgep4, i32 undef, i32 undef seq_cst monotonic +define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +bb: + %tmp = icmp sgt i32 %n, 0 + br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge + +.lr.ph.preheader: ; preds = %bb + br label %.lr.ph + +._crit_edge.loopexit: ; preds = %.lr.ph + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.loopexit, %bb + ret void + +.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader + %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %tmp1 = add nuw nsw i32 %indvars.iv, 16383 + %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1 + %tmp4 = cmpxchg i32 addrspace(3)* %tmp3, i32 undef, i32 undef seq_cst monotonic + %tmp4.0 = extractvalue { i32, i1 } %tmp4, 0 + %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv + %tmp7 = cmpxchg i32 addrspace(3)* %tmp6, i32 undef, i32 undef seq_cst monotonic + %tmp7.0 = extractvalue { i32, i1 } %tmp7, 0 + %tmp8 = add nsw i32 %tmp7.0, %tmp4.0 + atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst + %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, %n + br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph +} + +attributes #0 = { nounwind }
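+ +; An illustrative note on the 16383 index above (assuming the local atomic instructions on this target can fold an immediate byte offset): keeping the access in the form +; %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383 +; lets instruction selection use the atomic's offset field instead of re-materializing the add on every iteration, which is why the pointer operand of atomicrmw and cmpxchg must be considered here.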
\ No newline at end of file diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll index bf61112a3c3e..c5ea1b915d91 100644 --- a/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll +++ b/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; OPT: %lsr.iv2 = phi i8 addrspace(1)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv2, i64 4095 ; OPT: load i8, i8 addrspace(1)* %scevgep4, align 1 -define void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge @@ -48,7 +48,7 @@ bb: ; OPT: {{^}}.lr.ph: ; OPT: %lsr.iv3 = phi i8 addrspace(1)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv3, i64 1 -define void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge @@ -83,7 +83,7 @@ bb: ; OPT: %lsr.iv2 = phi i8 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv2, i32 65535 ; OPT: %tmp4 = load i8, i8 addrspace(3)* %scevgep4, align 1 -define void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge @@ -122,7 +122,7 @@ bb: ; OPT: {{^}}.lr.ph: ; OPT: %lsr.iv3 = phi i8 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv3, i32 1 -define void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll new file mode 100644 index 000000000000..02c3c05e7945 --- /dev/null +++ 
b/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target triple = "amdgcn--" + +; We need to compile this for a target where we have different address spaces, +; and where pointers in those address spaces have different size. +; E.g. for amdgcn-- pointers in address space 0 are 32 bits and pointers in +; address space 1 are 64 bits. + +; We shouldn't crash. Check that we get a loop with the two stores. +;CHECK-LABEL: foo: +;CHECK: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: +;CHECK: buffer_store_dword +;CHECK: buffer_store_dword +;CHECK: s_branch [[LOOP_LABEL]] + +define amdgpu_kernel void @foo() { +entry: + br label %loop + +loop: + %idx0 = phi i32 [ %next_idx0, %loop ], [ 0, %entry ] + %0 = getelementptr inbounds i32, i32* null, i32 %idx0 + %1 = getelementptr inbounds i32, i32 addrspace(1)* null, i32 %idx0 + store i32 1, i32* %0 + store i32 7, i32 addrspace(1)* %1 + %next_idx0 = add nuw nsw i32 %idx0, 1 + br label %loop +} + diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll index 8c83df5843d2..67b1926bdf27 100644 --- a/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll +++ b/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll @@ -16,7 +16,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK: bb: ; CHECK: inttoptr i32 %lsr.iv.next2 to i8 addrspace(3)* ; CHECK: %c1 = icmp ne i8 addrspace(3)* -define void @local_cmp_user(i32 %arg0) nounwind { +define amdgpu_kernel void @local_cmp_user(i32 %arg0) nounwind { entry: br label %bb11 @@ -47,7 +47,7 @@ bb13: ; CHECK: bb: ; CHECK: inttoptr i64 %lsr.iv.next2 to i8 addrspace(1)* ; CHECK: icmp ne i8 addrspace(1)* %t -define void @global_cmp_user(i64 %arg0) nounwind { +define amdgpu_kernel void @global_cmp_user(i64 %arg0) nounwind { entry: br label %bb11 @@ -78,7 +78,7 @@ bb13: ; CHECK: bb: ; CHECK: %idxprom = sext i32 %lsr.iv1 to i64 ; CHECK: getelementptr i8, i8 addrspace(1)* %t, i64 %idxprom -define void @global_gep_user(i32 %arg0) nounwind { +define amdgpu_kernel void @global_gep_user(i32 %arg0) nounwind { entry: br label %bb11 @@ -108,7 +108,7 @@ bb13: ; CHECK: bb ; CHECK: %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext -define void @global_sext_scale_user(i32 %arg0) nounwind { +define amdgpu_kernel void @global_sext_scale_user(i32 %arg0) nounwind { entry: br label %bb11 diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll index b3b696d42c59..9eba0c3051dc 100644 --- a/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll +++ b/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll @@ -14,7 +14,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK: %scevgep = getelementptr i32, i32 addrspace(3)* %tmp1, i32 4 ; CHECK:%tmp14 = load i32, i32 addrspace(3)* %scevgep -define void @lsr_crash_preserve_addrspace_unknown_type() #0 { +define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 { bb: br label %bb1 diff --git a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll index 
788842101080..a9d1e8758766 100644 --- a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll +++ b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll @@ -1,5 +1,4 @@ ; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9 -; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a9 -addr-sink-using-gep=1 %s -o - | FileCheck %s -check-prefix=A9 ; @simple is the most basic chain of address induction variables. Chaining ; saves at least one register and avoids complex addressing and setup diff --git a/test/Transforms/LoopStrengthReduce/X86/canonical.ll b/test/Transforms/LoopStrengthReduce/X86/canonical.ll new file mode 100644 index 000000000000..2dafbb408aad --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/X86/canonical.ll @@ -0,0 +1,65 @@ +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -loop-reduce -S < %s | FileCheck %s +; Check that LSR formula canonicalization puts loop-invariant regs before the +; induction variable of the current loop, so that expressions involving +; loop-invariant regs can be promoted outside of the current loop. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(i32 %size, i32 %nsteps, i8* nocapture %maxarray, i8* nocapture readnone %buffer, i32 %init) local_unnamed_addr #0 { +entry: + %cmp25 = icmp sgt i32 %nsteps, 0 + br i1 %cmp25, label %for.cond1.preheader.lr.ph, label %for.end12 + +for.cond1.preheader.lr.ph: ; preds = %entry + %cmp223 = icmp sgt i32 %size, 1 + %t0 = sext i32 %init to i64 + %wide.trip.count = zext i32 %size to i64 + %wide.trip.count31 = zext i32 %nsteps to i64 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc10, %for.cond1.preheader.lr.ph + %indvars.iv28 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next29, %for.inc10 ] + br i1 %cmp223, label %for.body3.lr.ph, label %for.inc10 + +for.body3.lr.ph: ; preds = %for.cond1.preheader + %t1 = add nsw i64 %indvars.iv28, %t0 + %t2 = trunc i64 %indvars.iv28 to i8 + br label %for.body3 + +; Make sure loop-invariant items are grouped together so that the load address +; can be represented in one getelementptr.
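+; For instance (an illustrative sketch with hypothetical names %invbase and %addr, not matched literally by the checks): with %t1 invariant in for.body3, the load address %maxarray + %t1 + %indvars.iv can be formed as +; %invbase = getelementptr i8, i8* %maxarray, i64 %t1 ; loop-invariant, hoistable +; %addr = getelementptr i8, i8* %invbase, i64 %indvars.iv +; leaving a single getelementptr indexed by the induction variable inside the loop.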
+; CHECK-LABEL: for.body3: +; CHECK-NEXT: [[LSR:%[^,]+]] = phi i64 [ 1, %for.body3.lr.ph ], [ {{.*}}, %for.body3 ] +; CHECK-NOT: = phi i64 +; CHECK-NEXT: [[LOADADDR:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]] +; CHECK-NEXT: = load i8, i8* [[LOADADDR]], align 1 +; CHECK: br i1 %exitcond, label %for.inc10.loopexit, label %for.body3 + +for.body3: ; preds = %for.body3, %for.body3.lr.ph + %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ] + %t5 = trunc i64 %indvars.iv to i8 + %t3 = add nsw i64 %t1, %indvars.iv + %arrayidx = getelementptr inbounds i8, i8* %maxarray, i64 %t3 + %t4 = load i8, i8* %arrayidx, align 1 + %add5 = add i8 %t4, %t5 + %add6 = add i8 %add5, %t2 + %arrayidx9 = getelementptr inbounds i8, i8* %maxarray, i64 %indvars.iv + store i8 %add6, i8* %arrayidx9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.inc10.loopexit, label %for.body3 + +for.inc10.loopexit: ; preds = %for.body3 + br label %for.inc10 + +for.inc10: ; preds = %for.inc10.loopexit, %for.cond1.preheader + %indvars.iv.next29 = add nuw nsw i64 %indvars.iv28, 1 + %exitcond32 = icmp eq i64 %indvars.iv.next29, %wide.trip.count31 + br i1 %exitcond32, label %for.end12.loopexit, label %for.cond1.preheader + +for.end12.loopexit: ; preds = %for.inc10 + br label %for.end12 + +for.end12: ; preds = %for.end12.loopexit, %entry + ret void +} diff --git a/test/Transforms/LoopStrengthReduce/X86/incorrect-offset-scaling.ll b/test/Transforms/LoopStrengthReduce/X86/incorrect-offset-scaling.ll new file mode 100644 index 000000000000..3adb8bcf514d --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/X86/incorrect-offset-scaling.ll @@ -0,0 +1,48 @@ +; RUN: opt -S -loop-reduce < %s | FileCheck %s + +target triple = "x86_64-unknown-unknown" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @incorrect_offset_scaling(i64, i64*) { +top: + br label %L + +L: ; preds = %idxend.10, %idxend, %L2, %top + br i1 undef, label %L, label %L1 + +L1: ; preds = %L1.preheader, %L2 + %r13 = phi i64 [ %r1, %L2 ], [ 1, %L ] +; CHECK: %lsr.iv = phi i64 [ 0, %L{{[^ ]+}} ], [ %lsr.iv.next, %L2 ] +; CHECK-NOT: %lsr.iv = phi i64 [ -1, %L{{[^ ]+}} ], [ %lsr.iv.next, %L2 ] +; CHECK: br + %r0 = add i64 %r13, -1 + br label %idxend.8 + +L2: ; preds = %idxend.8 + %r1 = add i64 %r13, 1 + br i1 undef, label %L, label %L1 + +if6: ; preds = %idxend.8 + %r2 = add i64 %0, -1 + %r3 = load i64, i64* %1, align 8 +; CHECK-NOT: %r2 +; CHECK: %r3 = load i64 + br label %ib + +idxend.8: ; preds = %L1 + br i1 undef, label %if6, label %L2 + +ib: ; preds = %if6 + %r4 = mul i64 %r3, %r0 + %r5 = add i64 %r2, %r4 + %r6 = icmp ult i64 %r5, undef +; CHECK: [[MUL1:%[0-9]+]] = mul i64 %lsr.iv, %r3 +; CHECK: [[ADD1:%[0-9]+]] = add i64 [[MUL1]], -1 +; CHECK: add i64 %{{.}}, [[ADD1]] +; CHECK: %r6 + %r7 = getelementptr i64, i64* undef, i64 %r5 + store i64 1, i64* %r7, align 8 +; CHECK: [[MUL2:%[0-9]+]] = mul i64 %lsr.iv, %r3 +; CHECK: [[ADD2:%[0-9]+]] = add i64 [[MUL2]], -1 + br label %L +} diff --git a/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll index ab7d4f1baa81..fb63b66137f3 100644 --- a/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ b/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -1,7 +1,5 @@ ; RUN: llc < %s -O3 -march=x86-64 -mcpu=core2 | FileCheck %s -check-prefix=X64 ; RUN: llc < %s -O3 -march=x86 -mcpu=core2 | FileCheck %s -check-prefix=X32 
-; RUN: llc < %s -O3 -march=x86-64 -mcpu=core2 -addr-sink-using-gep=1 | FileCheck %s -check-prefix=X64 -; RUN: llc < %s -O3 -march=x86 -mcpu=core2 -addr-sink-using-gep=1 | FileCheck %s -check-prefix=X32 ; @simple is the most basic chain of address induction variables. Chaining ; saves at least one register and avoids complex addressing and setup diff --git a/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll new file mode 100644 index 000000000000..4888536bdf81 --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN +; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS +; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s + +; The OPT runs check that LSR rewrites the compare against a static counter into a compare with 0. + +; BOTH: for.body: +; INSN: icmp eq i64 %lsr.iv.next, 0 +; REGS: icmp eq i64 %indvars.iv.next, 1024 + +; The LLC run checks that LSR optimizes the compare against a static counter. +; That means that instead of creating the following: +; movl %ecx, (%rdx,%rax,4) +; incq %rax +; cmpq $1024, %rax +; LSR should optimize out the cmp: +; movl %ecx, 4096(%rdx,%rax) +; addq $4, %rax +; or +; movl %ecx, 4096(%rdx,%rax,4) +; incq %rax + +; CHECK: LBB0_1: +; CHECK-NEXT: movl 4096(%{{.+}},[[REG:%[0-9a-z]+]] +; CHECK-NEXT: addl 4096(%{{.+}},[[REG]] +; CHECK-NEXT: movl %{{.+}}, 4096(%{{.+}},[[REG]] +; CHECK-NOT: cmp +; CHECK: jne + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Function Attrs: norecurse nounwind uwtable +define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %tmp1, %tmp + %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} diff --git a/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll new file mode 100644 index 000000000000..3273cb4e6b5b --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll @@ -0,0 +1,58 @@ +; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN +; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS +; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s + +; OPT checks that LSR prefers fewer instructions over fewer registers. +; For x86, LSR should prefer a complicated address mode over new lsr induction +; variables.
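+; Roughly, the two candidate solutions look as follows (an illustrative sketch): the instruction-count model keeps one plain induction variable and folds it into every access, +; %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv +; while the register-count model creates a pointer induction variable per array, e.g. +; %lsr.iv1 = phi i32* [ %scevgep, %for.body ], [ %q, %for.body.preheader ] +; trading extra registers for simpler per-access addressing; the INSN and REGS prefixes below check for each variant.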
+ +; BOTH: for.body: +; INSN: getelementptr i32, i32* %x, i64 %indvars.iv +; INSN: getelementptr i32, i32* %y, i64 %indvars.iv +; INSN: getelementptr i32, i32* %q, i64 %indvars.iv +; REGS: %lsr.iv4 = phi +; REGS: %lsr.iv2 = phi +; REGS: %lsr.iv1 = phi +; REGS: getelementptr i32, i32* %lsr.iv1, i64 1 +; REGS: getelementptr i32, i32* %lsr.iv2, i64 1 +; REGS: getelementptr i32, i32* %lsr.iv4, i64 1 + +; LLC checks that LSR prefers fewer instructions over fewer registers. +; LSR should prefer a complicated address mode over additional add instructions. + +; CHECK: LBB0_2: +; CHECK-NEXT: movl (%r{{.+}}, +; CHECK-NEXT: addl (%r{{.+}}, +; CHECK-NEXT: movl %e{{.+}}, (%r{{.+}}, + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Function Attrs: norecurse nounwind uwtable +define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q, i32 %n) { +entry: + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %tmp1, %tmp + %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll b/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll new file mode 100644 index 000000000000..b563eb3ad994 --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll @@ -0,0 +1,65 @@ +; RUN: opt -loop-reduce -S < %s | FileCheck %s +; Check that when an outer loop induction variable is used inside an inner loop +; induction value expression, LSR can still choose to use a single induction +; variable for the inner loop and share it in multiple induction value expressions. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(i32 %size, i32 %nsteps, i32 %hsize, i32* %lined, i8* %maxarray) { +entry: + %cmp215 = icmp sgt i32 %size, 1 + %t0 = zext i32 %size to i64 + %t1 = sext i32 %nsteps to i64 + %sub2 = sub i64 %t0, 2 + br label %for.body + +for.body: ; preds = %for.inc, %entry + %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc ], [ 0, %entry ] + %t2 = mul nsw i64 %indvars.iv2, %t0 + br i1 %cmp215, label %for.body2.preheader, label %for.inc + +for.body2.preheader: ; preds = %for.body + br label %for.body2 + +; Check that LSR generates only one induction variable for for.body2 and that the +; induction variable is shared by multiple array accesses.
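+; In address terms (an illustrative sketch with hypothetical names %base and %addr): the three accesses in for.body2 are %maxarray + %indvars.iv, %maxarray + %sub2 + %indvars.iv and %maxarray + %t2 + %indvars.iv, and since %sub2 and %t2 are invariant in the inner loop, each can be formed as +; %base = getelementptr i8, i8* %maxarray, i64 %sub2 ; hoistable +; %addr = getelementptr i8, i8* %base, i64 %lsr.iv +; off a loop-invariant base indexed by the one shared induction variable.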
+ +; CHECK: for.body2: +; CHECK-NEXT: [[LSR:%[^,]+]] = phi i64 [ %lsr.iv.next, %for.body2 ], [ 0, %for.body2.preheader ] +; CHECK-NOT: = phi i64 [ {{.*}}, %for.body2 ], [ {{.*}}, %for.body2.preheader ] +; CHECK: [[SCEVGEP1:%[^,]+]] = getelementptr i8, i8* %maxarray, i64 [[LSR]] +; CHECK: [[SCEVGEP2:%[^,]+]] = getelementptr i8, i8* [[SCEVGEP1]], i64 1 +; CHECK: {{.*}} = load i8, i8* [[SCEVGEP2]], align 1 +; CHECK: [[SCEVGEP3:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]] +; CHECK: {{.*}} = load i8, i8* [[SCEVGEP3]], align 1 +; CHECK: [[SCEVGEP4:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]] +; CHECK: store i8 {{.*}}, i8* [[SCEVGEP4]], align 1 +; CHECK: br i1 %exitcond, label %for.body2, label %for.inc.loopexit + +for.body2: ; preds = %for.body2.preheader, %for.body2 + %indvars.iv = phi i64 [ 1, %for.body2.preheader ], [ %indvars.iv.next, %for.body2 ] + %arrayidx1 = getelementptr inbounds i8, i8* %maxarray, i64 %indvars.iv + %v1 = load i8, i8* %arrayidx1, align 1 + %idx2 = add nsw i64 %indvars.iv, %sub2 + %arrayidx2 = getelementptr inbounds i8, i8* %maxarray, i64 %idx2 + %v2 = load i8, i8* %arrayidx2, align 1 + %tmpv = xor i8 %v1, %v2 + %t4 = add nsw i64 %t2, %indvars.iv + %add.ptr = getelementptr inbounds i8, i8* %maxarray, i64 %t4 + store i8 %tmpv, i8* %add.ptr, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %size to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body2, label %for.inc.loopexit + +for.inc.loopexit: ; preds = %for.body2 + br label %for.inc + +for.inc: ; preds = %for.inc.loopexit, %for.body + %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1 + %cmp = icmp slt i64 %indvars.iv.next3, %t1 + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.inc + ret void +} diff --git a/test/Transforms/LoopStrengthReduce/X86/sibling-loops.ll b/test/Transforms/LoopStrengthReduce/X86/sibling-loops.ll new file mode 100644 index 000000000000..a69d6adc0f03 --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/X86/sibling-loops.ll @@ -0,0 +1,97 @@ +; RUN: opt -loop-reduce -S < %s | FileCheck %s +; It is harmful to allow an LSR formula to contain a SCEVAddRecExpr register +; from a sibling of the current loop. When one loop is LSR optimized, such a +; formula can insert lsr.iv instructions into its sibling loops, which sometimes +; leads to many extra lsr.iv instructions being inserted. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@cond = common local_unnamed_addr global i64 0, align 8 + +; Check that no extra lsr.iv is generated in foo.
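+; The failure mode being guarded against (an illustrative sketch): while optimizing one loop, LSR could pick a formula built on a recurrence that belongs to a sibling such as do.body8, forcing a new phi like +; %lsr.iv1 = phi i64 [ %lsr.iv.next1, %do.body8 ], [ 0, %do.body8.preheader ] +; to be inserted into that sibling loop; with six chained siblings this compounds into many spurious induction variables, hence the CHECK-NOT on any numbered lsr.iv below.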
+; CHECK-LABEL: @foo( +; CHECK-NOT: lsr.iv{{[0-9]+}} = +; +define void @foo(i64 %N) local_unnamed_addr { +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %i.0 = phi i64 [ 0, %entry ], [ %inc, %do.body ] + tail call void @goo(i64 %i.0, i64 %i.0) + %inc = add nuw nsw i64 %i.0, 1 + %t0 = load i64, i64* @cond, align 8 + %tobool = icmp eq i64 %t0, 0 + br i1 %tobool, label %do.body2.preheader, label %do.body + +do.body2.preheader: ; preds = %do.body + br label %do.body2 + +do.body2: ; preds = %do.body2.preheader, %do.body2 + %i.1 = phi i64 [ %inc3, %do.body2 ], [ 0, %do.body2.preheader ] + %j.1 = phi i64 [ %inc4, %do.body2 ], [ %inc, %do.body2.preheader ] + tail call void @goo(i64 %i.1, i64 %j.1) + %inc3 = add nuw nsw i64 %i.1, 1 + %inc4 = add nsw i64 %j.1, 1 + %t1 = load i64, i64* @cond, align 8 + %tobool6 = icmp eq i64 %t1, 0 + br i1 %tobool6, label %do.body8.preheader, label %do.body2 + +do.body8.preheader: ; preds = %do.body2 + br label %do.body8 + +do.body8: ; preds = %do.body8.preheader, %do.body8 + %i.2 = phi i64 [ %inc9, %do.body8 ], [ 0, %do.body8.preheader ] + %j.2 = phi i64 [ %inc10, %do.body8 ], [ %inc4, %do.body8.preheader ] + tail call void @goo(i64 %i.2, i64 %j.2) + %inc9 = add nuw nsw i64 %i.2, 1 + %inc10 = add nsw i64 %j.2, 1 + %t2 = load i64, i64* @cond, align 8 + %tobool12 = icmp eq i64 %t2, 0 + br i1 %tobool12, label %do.body14.preheader, label %do.body8 + +do.body14.preheader: ; preds = %do.body8 + br label %do.body14 + +do.body14: ; preds = %do.body14.preheader, %do.body14 + %i.3 = phi i64 [ %inc15, %do.body14 ], [ 0, %do.body14.preheader ] + %j.3 = phi i64 [ %inc16, %do.body14 ], [ %inc10, %do.body14.preheader ] + tail call void @goo(i64 %i.3, i64 %j.3) + %inc15 = add nuw nsw i64 %i.3, 1 + %inc16 = add nsw i64 %j.3, 1 + %t3 = load i64, i64* @cond, align 8 + %tobool18 = icmp eq i64 %t3, 0 + br i1 %tobool18, label %do.body20.preheader, label %do.body14 + +do.body20.preheader: ; preds = %do.body14 + br label %do.body20 + +do.body20: ; preds = %do.body20.preheader, %do.body20 + %i.4 = phi i64 [ %inc21, %do.body20 ], [ 0, %do.body20.preheader ] + %j.4 = phi i64 [ %inc22, %do.body20 ], [ %inc16, %do.body20.preheader ] + tail call void @goo(i64 %i.4, i64 %j.4) + %inc21 = add nuw nsw i64 %i.4, 1 + %inc22 = add nsw i64 %j.4, 1 + %t4 = load i64, i64* @cond, align 8 + %tobool24 = icmp eq i64 %t4, 0 + br i1 %tobool24, label %do.body26.preheader, label %do.body20 + +do.body26.preheader: ; preds = %do.body20 + br label %do.body26 + +do.body26: ; preds = %do.body26.preheader, %do.body26 + %i.5 = phi i64 [ %inc27, %do.body26 ], [ 0, %do.body26.preheader ] + %j.5 = phi i64 [ %inc28, %do.body26 ], [ %inc22, %do.body26.preheader ] + tail call void @goo(i64 %i.5, i64 %j.5) + %inc27 = add nuw nsw i64 %i.5, 1 + %inc28 = add nsw i64 %j.5, 1 + %t5 = load i64, i64* @cond, align 8 + %tobool30 = icmp eq i64 %t5, 0 + br i1 %tobool30, label %do.end31, label %do.body26 + +do.end31: ; preds = %do.body26 + ret void +} + +declare void @goo(i64, i64) local_unnamed_addr + diff --git a/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll b/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll deleted file mode 100644 index 09f0e1aa2a09..000000000000 --- a/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll +++ /dev/null @@ -1,54 +0,0 @@ -; RUN: opt < %s -analyze -iv-users | FileCheck %s -; RUN: opt -passes='function(require<scalar-evolution>,loop(print<ivusers>))' -S < %s 2>&1| FileCheck %s - -; Provide legal integer types. 
-target datalayout = "n8:16:32:64" - -; The value of %r is dependent on a polynomial iteration expression. -; -; CHECK-LABEL: IV Users for loop %foo.loop -; CHECK: {1,+,3,+,2}<%foo.loop> -define i64 @foo(i64 %n) { -entry: - br label %foo.loop - -foo.loop: - %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %foo.loop ] - %indvar.next = add i64 %indvar, 1 - %c = icmp eq i64 %indvar.next, %n - br i1 %c, label %exit, label %foo.loop - -exit: - %r = mul i64 %indvar.next, %indvar.next - ret i64 %r -} - -; PR15470: LSR miscompile. The test2 function should return '1'. -; -; SCEV does not know how to denormalize chained recurrences, so make -; sure they aren't marked as post-inc users. -; -; CHECK-LABEL: IV Users for loop %test2.loop -; CHECK: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us))<nuw><nsw>,+,33554432}<%test2.loop> in %f = ashr i32 %sext.us, 24 -define i32 @test2() { -entry: - br label %test2.loop - -test2.loop: - %inc1115.us = phi i32 [ 0, %entry ], [ %inc11.us, %test2.loop ] - %inc11.us = add nsw i32 %inc1115.us, 1 - %cmp.us = icmp slt i32 %inc11.us, 2 - br i1 %cmp.us, label %test2.loop, label %for.end - -for.end: - %tobool.us = icmp eq i32 %inc1115.us, 0 - %sub.us = select i1 %tobool.us, i32 0, i32 0 - %mul.us = shl i32 %inc1115.us, 24 - %sub.cond.us = sub nsw i32 %inc1115.us, %sub.us - %sext.us = mul i32 %mul.us, %sub.cond.us - %f = ashr i32 %sext.us, 24 - br label %exit - -exit: - ret i32 %f -} diff --git a/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll b/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll index e732ddc2bc84..ca8cc32469d8 100644 --- a/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll +++ b/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll @@ -6,7 +6,7 @@ ; CHECK: call void @llvm.amdgcn.s.barrier() ; CHECK: call void @llvm.amdgcn.s.barrier() ; CHECK-NOT: br -define void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 { +define amdgpu_kernel void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 { entry: br label %for.body diff --git a/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll b/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll new file mode 100644 index 000000000000..e986c3dc2a28 --- /dev/null +++ b/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll @@ -0,0 +1,154 @@ +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=20000 %s | FileCheck %s + +; Check that we full unroll loop to be able to eliminate alloca +; CHECK-LABEL: @non_invariant_ind +; CHECK: for.body: +; CHECK-NOT: br +; CHECK: store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4 +; CHECK: ret void + +define amdgpu_kernel void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) { +entry: + %arr = alloca [64 x i32], align 4 + %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %arrayidx5 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %x + %tmp15 = load i32, i32* %arrayidx5, align 4 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1 + store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %i.015 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %idxprom = sext i32 %i.015 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %add = add nsw i32 
%i.015, %tmp1 + %rem = srem i32 %add, 64 + %arrayidx3 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %rem + store i32 %tmp16, i32* %arrayidx3, align 4 + %inc = add nuw nsw i32 %i.015, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Check that we unroll inner loop but not outer +; CHECK-LABEL: @invariant_ind +; CHECK: %[[exitcond:[^ ]+]] = icmp eq i32 %{{.*}}, 32 +; CHECK: br i1 %[[exitcond]] +; CHECK-NOT: icmp eq i32 %{{.*}}, 100 + +define amdgpu_kernel void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) { +entry: + %arr = alloca [64 x i32], align 4 + %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + br label %for.cond2.preheader + +for.cond2.preheader: ; preds = %for.cond.cleanup5, %entry + %i.026 = phi i32 [ 0, %entry ], [ %inc10, %for.cond.cleanup5 ] + %idxprom = sext i32 %i.026 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %tmp15 = load i32, i32 addrspace(1)* %arrayidx, align 4 + br label %for.body6 + +for.cond.cleanup: ; preds = %for.cond.cleanup5 + %arrayidx13 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %x + %tmp16 = load i32, i32* %arrayidx13, align 4 + %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1 + store i32 %tmp16, i32 addrspace(1)* %arrayidx15, align 4 + ret void + +for.cond.cleanup5: ; preds = %for.body6 + %inc10 = add nuw nsw i32 %i.026, 1 + %exitcond27 = icmp eq i32 %inc10, 32 + br i1 %exitcond27, label %for.cond.cleanup, label %for.cond2.preheader + +for.body6: ; preds = %for.body6, %for.cond2.preheader + %j.025 = phi i32 [ 0, %for.cond2.preheader ], [ %inc, %for.body6 ] + %add = add nsw i32 %j.025, %tmp1 + %rem = srem i32 %add, 64 + %arrayidx8 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %rem + store i32 %tmp15, i32* %arrayidx8, align 4 + %inc = add nuw nsw i32 %j.025, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.cond.cleanup5, label %for.body6 +} + +; Check we do not enforce unroll if alloca is too big +; CHECK-LABEL: @too_big +; CHECK: for.body: +; CHECK: icmp eq i32 %{{.*}}, 100 +; CHECK: br + +define amdgpu_kernel void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) { +entry: + %arr = alloca [256 x i32], align 4 + %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %arrayidx5 = getelementptr inbounds [256 x i32], [256 x i32]* %arr, i32 0, i32 %x + %tmp15 = load i32, i32* %arrayidx5, align 4 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1 + store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %i.015 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %idxprom = sext i32 %i.015 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %add = add nsw i32 %i.015, %tmp1 + %rem = srem i32 %add, 64 + %arrayidx3 = getelementptr inbounds [256 x i32], [256 x i32]* %arr, i32 0, i32 %rem + store i32 %tmp16, i32* %arrayidx3, align 4 + %inc = add nuw nsw i32 %i.015, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Check we do not enforce unroll if alloca is dynamic +; CHECK-LABEL: @dynamic_size_alloca( +; CHECK: alloca i32, i32 %n +; CHECK: for.body: +; CHECK: icmp eq i32 %{{.*}}, 100 +; CHECK: br + +define amdgpu_kernel void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, 
i32 %n, i32 %x) { +entry: + %arr = alloca i32, i32 %n, align 4 + %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %arrayidx5 = getelementptr inbounds i32, i32* %arr, i32 %x + %tmp15 = load i32, i32* %arrayidx5, align 4 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1 + store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %i.015 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %idxprom = sext i32 %i.015 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %add = add nsw i32 %i.015, %tmp1 + %rem = srem i32 %add, 64 + %arrayidx3 = getelementptr inbounds i32, i32* %arr, i32 %rem + store i32 %tmp16, i32* %arrayidx3, align 4 + %inc = add nuw nsw i32 %i.015, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1 + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #1 + +attributes #1 = { nounwind readnone } diff --git a/test/Transforms/LoopUnroll/basic.ll b/test/Transforms/LoopUnroll/basic.ll index 2bfd3e6de8fc..e965f2a19c04 100644 --- a/test/Transforms/LoopUnroll/basic.ll +++ b/test/Transforms/LoopUnroll/basic.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-unroll -S | FileCheck %s +; RUN: opt < %s -passes='require<opt-remark-emit>,loop(unroll)' -S | FileCheck %s ; This should not unroll since the address of the loop header is taken. diff --git a/test/Transforms/LoopUnroll/epilog_const_phi.ll b/test/Transforms/LoopUnroll/epilog_const_phi.ll new file mode 100644 index 000000000000..22e525760942 --- /dev/null +++ b/test/Transforms/LoopUnroll/epilog_const_phi.ll @@ -0,0 +1,65 @@ +; RUN: opt -S -loop-unroll -unroll-runtime < %s | FileCheck %s + +; Epilogue unrolling makes it possible to keep a PHI's constant value. +; For this test it means that after unrolling the XOR can be deleted. +; Check that we emit an epilogue remainder here. + +; CHECK-LABEL: const_phi_val +; CHECK: for.body.epil + +; Function Attrs: norecurse nounwind uwtable +define void @const_phi_val(i32 %i0, i32* nocapture %a) { +entry: + %cmp6 = icmp slt i32 %i0, 1000 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %tmp = sext i32 %i0 to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader + %indvars.iv = phi i64 [ %tmp, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %s.08 = phi i32 [ 0, %for.body.preheader ], [ %xor, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %s.08, i32* %arrayidx, align 4 + %xor = xor i32 %s.08, 1 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; When there is no phi with a constant incoming value from the preheader, +; there is no need to do epilogue unrolling.
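+; Why the epilogue form helps const_phi_val above (an illustrative sketch, with %arrayidx.1 as a hypothetical unrolled name): with an epilogue remainder the unrolled main body is entered only from the preheader, so %s.08 starts at the known constant 0 and the unrolled iterations store folded values directly, roughly +; store i32 0, i32* %arrayidx, align 4 ; %s.08 known to be 0 +; store i32 1, i32* %arrayidx.1, align 4 ; xor folded away +; whereas a prologue remainder would feed an unknown phi value into the main body. In var_phi_val below there is no constant phi, so the prologue form is expected instead.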
+ +; CHECK-LABEL: var_phi_val +; CHECK: for.body.prol + +; Function Attrs: norecurse nounwind uwtable +define void @var_phi_val(i32 %i0, i32* nocapture %a) { +entry: + %cmp6 = icmp slt i32 %i0, 1000 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %tmp = sext i32 %i0 to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader + %indvars.iv = phi i64 [ %tmp, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} diff --git a/test/Transforms/LoopUnroll/full-unroll-bad-cost.ll b/test/Transforms/LoopUnroll/full-unroll-bad-cost.ll index e5694fbeb0ce..9bbd21accc8e 100644 --- a/test/Transforms/LoopUnroll/full-unroll-bad-cost.ll +++ b/test/Transforms/LoopUnroll/full-unroll-bad-cost.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-unroll < %s | FileCheck %s +; RUN: opt < %s -passes='require<opt-remark-emit>,loop(unroll-full)' -S | FileCheck %s ; LLVM should not try to fully unroll this loop. diff --git a/test/Transforms/LoopUnroll/full-unroll-crashers.ll b/test/Transforms/LoopUnroll/full-unroll-crashers.ll index 9f1529139de0..d83e56635e8c 100644 --- a/test/Transforms/LoopUnroll/full-unroll-crashers.ll +++ b/test/Transforms/LoopUnroll/full-unroll-crashers.ll @@ -1,5 +1,6 @@ ; Check that we don't crash on corner cases. ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=1 -unroll-max-percent-threshold-boost=200 -o /dev/null +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=1 -unroll-max-percent-threshold-boost=200 -o /dev/null target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [10 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16 diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll index 26124fb32ca2..a143056affe7 100644 --- a/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll +++ b/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll index 8bddb1b225c2..e65f794286ee 100644 --- a/test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll +++ b/test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' 
-unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [10 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16 diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll index 83c105ca23f5..57de3041f571 100644 --- a/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll +++ b/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=12 -unroll-max-percent-threshold-boost=400 | FileCheck %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=12 -unroll-max-percent-threshold-boost=400 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [10 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0], align 16 diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll index 230912538d23..238869d120ba 100644 --- a/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll +++ b/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; When examining gep-instructions we shouldn't consider them simplified if the diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll index a1fab3cc71e1..aa517cb1589d 100644 --- a/test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll +++ b/test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define i64 @propagate_loop_phis() { diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics.ll index 7189fbb34833..a2fe1f5e3fda 100644 --- a/test/Transforms/LoopUnroll/full-unroll-heuristics.ll +++ b/test/Transforms/LoopUnroll/full-unroll-heuristics.ll @@ -21,6 +21,15 @@ ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=200 | FileCheck %s -check-prefix=TEST2 ; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST3 +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 
-unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST1 +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=200 | FileCheck %s -check-prefix=TEST2 +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST3 + +; Check that these work when the unroller has partial unrolling enabled too. +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST1 +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=200 | FileCheck %s -check-prefix=TEST2 +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST3 + ; If the absolute threshold is too low, we should not unroll: ; TEST1: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv @@ -32,6 +41,7 @@ ; And check that we don't crash when we're not allowed to do any analysis. ; RUN: opt < %s -loop-unroll -unroll-max-iteration-count-to-analyze=0 -disable-output +; RUN: opt < %s -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-max-iteration-count-to-analyze=0 -disable-output target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 diff --git a/test/Transforms/LoopUnroll/full-unroll-keep-first-exit.ll b/test/Transforms/LoopUnroll/full-unroll-keep-first-exit.ll index e70ff4156d35..682d1b35c1fa 100644 --- a/test/Transforms/LoopUnroll/full-unroll-keep-first-exit.ll +++ b/test/Transforms/LoopUnroll/full-unroll-keep-first-exit.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-unroll < %s | FileCheck %s +; RUN: opt -S -passes='require<opt-remark-emit>,loop(unroll-full)' < %s | FileCheck %s ; Unroll twice, with first loop exit kept ; CHECK-LABEL: @s32_max1 diff --git a/test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll b/test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll index 49c823a28c7f..8e5a866f8ca7 100644 --- a/test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll +++ b/test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll @@ -1,4 +1,8 @@ -; RUN: opt < %s -S -unroll-threshold=20 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=100 | FileCheck %s +; RUN: opt < %s -S -unroll-partial-threshold=20 -unroll-threshold=20 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=100 | FileCheck %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-partial-threshold=20 -unroll-threshold=20 -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=100 | FileCheck %s +; +; Also check that the simple unroller doesn't allow the partial unrolling. 
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-partial-threshold=20 -unroll-threshold=20 -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=100 | FileCheck %s --check-prefix=CHECK-NO-UNROLL ; The Loop TripCount is 9. However unroll factors 3 or 9 exceed given threshold. ; The test checks that we choose a smaller, power-of-two, unroll count and do not give up on unrolling. @@ -8,6 +12,10 @@ ; CHECK: for.body.1: ; CHECK: store +; CHECK-NO-UNROLL: for.body: +; CHECK-NO-UNROLL: store +; CHECK-NO-UNROLL-NOT: store + define void @foo(i32* nocapture %a, i32* nocapture readonly %b) nounwind uwtable { entry: br label %for.body diff --git a/test/Transforms/LoopUnroll/peel-loop-irreducible.ll b/test/Transforms/LoopUnroll/peel-loop-irreducible.ll new file mode 100644 index 000000000000..32a7a0732e10 --- /dev/null +++ b/test/Transforms/LoopUnroll/peel-loop-irreducible.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -S -loop-unroll -unroll-force-peel-count=1 | FileCheck %s + +; Check we don't peel loops where the latch is not the exiting block. +; CHECK-LABEL: @invariant_backedge_irreducible +; CHECK: entry: +; CHECK: br label %header +; CHECK-NOT: peel +; CHECK: header: +; CHECK: br i1 {{.*}} label %latch, label %exiting +; CHECK: latch: +; CHECK: br i1 {{.*}} label %header, label %exiting +; CHECK: exiting: +; CHECK: br i1 {{.*}} label %latch, label %exit + +define i32 @invariant_backedge_irreducible(i32 %a, i32 %b) { +entry: + br label %header + +header: + %i = phi i32 [ 0, %entry ], [ %inc, %latch ] + %cmp.phi = phi i1 [ false, %entry ], [ %cmp, %latch ] + br i1 %cmp.phi, label %latch, label %exiting + +latch: + %inc = add i32 %i, 1 + %cmp = icmp slt i32 %i, 1000 + br i1 %cmp, label %header, label %exiting + +exiting: + %cmp.exiting = phi i1 [ %cmp.phi, %header ], [ %cmp, %latch ] + br i1 %cmp.exiting, label %latch, label %exit + +exit: + ret i32 0 +} + diff --git a/test/Transforms/LoopUnroll/peel-loop-not-forced.ll b/test/Transforms/LoopUnroll/peel-loop-not-forced.ll new file mode 100644 index 000000000000..3dcac87f8242 --- /dev/null +++ b/test/Transforms/LoopUnroll/peel-loop-not-forced.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -S -loop-unroll -unroll-threshold=4 | FileCheck %s + +define i32 @invariant_backedge_1(i32 %a, i32 %b) { +; CHECK-LABEL: @invariant_backedge_1 +; CHECK-NOT: %plus = phi +; CHECK: loop.peel: +; CHECK: loop: +; CHECK: %i = phi +; CHECK: %sum = phi +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %sum = phi i32 [ 0, %entry ], [ %incsum, %loop ] + %plus = phi i32 [ %a, %entry ], [ %b, %loop ] + + %incsum = add i32 %sum, %plus + %inc = add i32 %i, 1 + %cmp = icmp slt i32 %i, 1000 + + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %sum +} + +; Peeling should fail due to method size. 
+define i32 @invariant_backedge_2(i32 %a, i32 %b) {
+; CHECK-LABEL: @invariant_backedge_2
+; CHECK-NOT: loop.peel:
+; CHECK: loop:
+; CHECK: %i = phi
+; CHECK: %sum = phi
+; CHECK: %plus = phi
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
+  %sum = phi i32 [ 0, %entry ], [ %incsum2, %loop ]
+  %plus = phi i32 [ %a, %entry ], [ %b, %loop ]
+
+  %incsum = add i32 %sum, %plus
+  %incsum2 = add i32 %incsum, %plus
+  %inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i, 1000
+
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %sum
+}
diff --git a/test/Transforms/LoopUnroll/peel-loop-pgo.ll b/test/Transforms/LoopUnroll/peel-loop-pgo.ll
index a87d5643e7e9..20c3878d03a7 100644
--- a/test/Transforms/LoopUnroll/peel-loop-pgo.ll
+++ b/test/Transforms/LoopUnroll/peel-loop-pgo.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll -unroll-allow-peeling 2>&1 | FileCheck %s
+; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; Make sure we use the profile information correctly to peel-off 3 iterations
diff --git a/test/Transforms/LoopUnroll/peel-loop.ll b/test/Transforms/LoopUnroll/peel-loop.ll
index 249122022387..bf0801fc760a 100644
--- a/test/Transforms/LoopUnroll/peel-loop.ll
+++ b/test/Transforms/LoopUnroll/peel-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -loop-unroll -unroll-force-peel-count=3 -simplifycfg -instcombine | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -unroll-force-peel-count=3 -verify-dom-info -simplifycfg -instcombine | FileCheck %s
 
 ; Basic loop peeling - check that we can peel-off the first 3 loop iterations
 ; when explicitly requested.
diff --git a/test/Transforms/LoopUnroll/peel-loop2.ll b/test/Transforms/LoopUnroll/peel-loop2.ll
new file mode 100644
index 000000000000..99e90797e199
--- /dev/null
+++ b/test/Transforms/LoopUnroll/peel-loop2.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -loop-unroll -unroll-force-peel-count=1 -verify-dom-info < %s | FileCheck %s
+
+; Check that a loop composed of several BBs is peeled correctly.
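+;
+; As a rough sketch (illustrative only; the authoritative checks are the CHECK
+; lines below, and peeled blocks get a ".peel"-style suffix as in the other
+; peeling tests in this directory), the expected shape after peeling one
+; iteration is:
+;
+;   for.body.peel:                  ; the single peeled iteration, at %b
+;     load i8, i8* @Comma
+;     ...peeled copies of if.then and for.inc...
+;
+;   for.body:                       ; the remaining loop, starting at %b + 1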
+
+declare void @funcb()
+@Comma = external global i8
+define void @funca(i8* readnone %b, i8* readnone %e) {
+entry:
+  %cmp2 = icmp eq i8* %b, %e
+  br i1 %cmp2, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %b.addr.03 = phi i8* [ %incdec.ptr, %for.inc ], [ %b, %for.body.preheader ]
+  %0 = load i8, i8* @Comma
+  %tobool = icmp eq i8 %0, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:
+  tail call void @funcb()
+  store i8 1, i8* @Comma
+  br label %for.inc
+
+for.inc:
+  %incdec.ptr = getelementptr inbounds i8, i8* %b.addr.03, i64 1
+  %cmp = icmp eq i8* %incdec.ptr, %e
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @funca
+
+; Peeled iteration
+; CHECK: %[[REG1:[0-9]+]] = load i8, i8* @Comma
+; CHECK: %[[REG2:.*]] = icmp eq i8 %[[REG1]], 0
+; CHECK: br i1 %[[REG2]], label %{{.*}}, label %[[IFTHEN:.*]]
+; CHECK: [[IFTHEN]]:
+; CHECK: call void @funcb()
+; CHECK: store i8 1, i8* @Comma
+; CHECK: br label %[[FORINC:.*]]
+; CHECK: [[FORINC]]:
+; CHECK: %[[REG3:.*]] = getelementptr inbounds i8, i8* %b, i64 1
+; CHECK: %[[REG4:.*]] = icmp eq i8* %[[REG3]], %e
+; CHECK: br i1 %[[REG4]]
+
+; main body
+; CHECK: %[[REG1b:.*]] = load i8, i8* @Comma
+; CHECK: %[[REG2b:.*]] = icmp eq i8 %[[REG1b]], 0
+; CHECK: br i1 %[[REG2b]], label %{{.*}}, label %[[IFTHENb:.*]]
+; CHECK: [[IFTHENb]]:
+; CHECK: call void @funcb()
+; CHECK: store i8 1, i8* @Comma
+; CHECK: br label %[[FORINCb:.*]]
+; CHECK: [[FORINCb]]:
+; CHECK: %[[REG3b:.*]] = getelementptr inbounds i8, i8* %b, i64 1
+; CHECK: %[[REG4b:.*]] = icmp eq i8* %[[REG3b]], %e
+; CHECK: br i1 %[[REG4b]]
diff --git a/test/Transforms/LoopUnroll/pr31718.ll b/test/Transforms/LoopUnroll/pr31718.ll
new file mode 100644
index 000000000000..014ef7e501ec
--- /dev/null
+++ b/test/Transforms/LoopUnroll/pr31718.ll
@@ -0,0 +1,55 @@
+; RUN: opt -loop-unroll -verify-loop-lcssa -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = external local_unnamed_addr global i32, align 4
+
+; CHECK-LABEL: @main
+; CHECK: exit.loopexit:
+; CHECK: {{.*}} = phi i32 [ %d.0, %h3 ]
+; CHECK: br label %exit
+; CHECK: exit.loopexit1:
+; CHECK: {{.*}} = phi i32 [ %d.0, %h3.1 ]
+; CHECK: br label %exit
+
+define void @main() local_unnamed_addr #0 {
+ph1:
+  br label %h1
+
+h1:
+  %d.0 = phi i32 [ %1, %latch1 ], [ undef, %ph1 ]
+  br label %ph2
+
+ph2:
+  br label %h2
+
+h2:
+  %0 = phi i32 [ 0, %ph2 ], [ %inc, %latch2 ]
+  br label %h3
+
+h3:
+  br i1 undef, label %latch3, label %exit
+
+latch3:
+  br i1 false, label %exit3, label %h3
+
+exit3:
+  br label %latch2
+
+latch2:
+  %inc = add nuw nsw i32 %0, 1
+  %cmp = icmp slt i32 %inc, 2
+  br i1 %cmp, label %h2, label %exit2
+
+exit2:
+  br i1 undef, label %latch1, label %ph2
+
+latch1:                                           ; preds = %exit2
+  %1 = load i32, i32* @b, align 4
+  br label %h1
+
+exit:
+  %d.0.lcssa = phi i32 [ %d.0, %h3 ]
+  ret void
+}
diff --git a/test/Transforms/LoopUnroll/revisit.ll b/test/Transforms/LoopUnroll/revisit.ll
new file mode 100644
index 000000000000..fddf6cd1c4e8
--- /dev/null
+++ b/test/Transforms/LoopUnroll/revisit.ll
@@ -0,0 +1,156 @@
+; This test checks that nested loops are revisited in various scenarios when
+; unrolling. Note that if we ever start doing outer loop peeling, a test case
+; for that should be added here; it will look essentially like a hybrid of the
+; current two cases.
+; +; RUN: opt < %s -disable-output -debug-pass-manager 2>&1 \ +; RUN: -passes='require<opt-remark-emit>,loop(unroll)' \ +; RUN: | FileCheck %s +; +; Also run in a special mode that visits children. +; RUN: opt < %s -disable-output -debug-pass-manager -unroll-revisit-child-loops 2>&1 \ +; RUN: -passes='require<opt-remark-emit>,loop(unroll)' \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-CHILDREN + +; Basic test is fully unrolled and we revisit the post-unroll new sibling +; loops, including the ones that used to be child loops. +define void @full_unroll(i1* %ptr) { +; CHECK-LABEL: FunctionToLoopPassAdaptor{{.*}} on full_unroll +; CHECK-NOT: LoopUnrollPass + +entry: + br label %l0 + +l0: + %cond.0 = load volatile i1, i1* %ptr + br i1 %cond.0, label %l0.0.ph, label %exit + +l0.0.ph: + br label %l0.0 + +l0.0: + %iv = phi i32 [ %iv.next, %l0.0.latch ], [ 0, %l0.0.ph ] + %iv.next = add i32 %iv, 1 + br label %l0.0.0.ph + +l0.0.0.ph: + br label %l0.0.0 + +l0.0.0: + %cond.0.0.0 = load volatile i1, i1* %ptr + br i1 %cond.0.0.0, label %l0.0.0, label %l0.0.1.ph +; CHECK: LoopUnrollPass on Loop at depth 3 containing: %l0.0.0<header> +; CHECK-NOT: LoopUnrollPass + +l0.0.1.ph: + br label %l0.0.1 + +l0.0.1: + %cond.0.0.1 = load volatile i1, i1* %ptr + br i1 %cond.0.0.1, label %l0.0.1, label %l0.0.latch +; CHECK: LoopUnrollPass on Loop at depth 3 containing: %l0.0.1<header> +; CHECK-NOT: LoopUnrollPass + +l0.0.latch: + %cmp = icmp slt i32 %iv.next, 2 + br i1 %cmp, label %l0.0, label %l0.latch +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0 +; CHECK-NOT: LoopUnrollPass +; +; Unrolling occurs, so we visit what were the inner loops twice over. First we +; visit their clones, and then we visit the original loops re-parented. +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.1.1<header> +; CHECK-NOT: LoopUnrollPass +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.0.1<header> +; CHECK-NOT: LoopUnrollPass +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.1<header> +; CHECK-NOT: LoopUnrollPass +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.0<header> +; CHECK-NOT: LoopUnrollPass + +l0.latch: + br label %l0 +; CHECK: LoopUnrollPass on Loop at depth 1 containing: %l0<header> +; CHECK-NOT: LoopUnrollPass + +exit: + ret void +} + +; Now we test forced runtime partial unrolling with metadata. Here we end up +; duplicating child loops without changing their structure and so they aren't by +; default visited, but will be visited with a special parameter. 
+define void @partial_unroll(i32 %count, i1* %ptr) { +; CHECK-LABEL: FunctionToLoopPassAdaptor{{.*}} on partial_unroll +; CHECK-NOT: LoopUnrollPass + +entry: + br label %l0 + +l0: + %cond.0 = load volatile i1, i1* %ptr + br i1 %cond.0, label %l0.0.ph, label %exit + +l0.0.ph: + br label %l0.0 + +l0.0: + %iv = phi i32 [ %iv.next, %l0.0.latch ], [ 0, %l0.0.ph ] + %iv.next = add i32 %iv, 1 + br label %l0.0.0.ph + +l0.0.0.ph: + br label %l0.0.0 + +l0.0.0: + %cond.0.0.0 = load volatile i1, i1* %ptr + br i1 %cond.0.0.0, label %l0.0.0, label %l0.0.1.ph +; CHECK: LoopUnrollPass on Loop at depth 3 containing: %l0.0.0<header> +; CHECK-NOT: LoopUnrollPass + +l0.0.1.ph: + br label %l0.0.1 + +l0.0.1: + %cond.0.0.1 = load volatile i1, i1* %ptr + br i1 %cond.0.0.1, label %l0.0.1, label %l0.0.latch +; CHECK: LoopUnrollPass on Loop at depth 3 containing: %l0.0.1<header> +; CHECK-NOT: LoopUnrollPass + +l0.0.latch: + %cmp = icmp slt i32 %iv.next, %count + br i1 %cmp, label %l0.0, label %l0.latch, !llvm.loop !1 +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0 +; CHECK-NOT: LoopUnrollPass +; +; Partial unrolling occurs which introduces both new child loops and new sibling +; loops. We only visit the child loops in a special mode, not by default. +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 3 containing: %l0.0.0<header> +; CHECK-CHILDREN-NOT: LoopUnrollPass +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 3 containing: %l0.0.1<header> +; CHECK-CHILDREN-NOT: LoopUnrollPass +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 3 containing: %l0.0.0.1<header> +; CHECK-CHILDREN-NOT: LoopUnrollPass +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 3 containing: %l0.0.1.1<header> +; CHECK-CHILDREN-NOT: LoopUnrollPass +; +; When we revisit children, we also revisit the current loop. +; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 2 containing: %l0.0<header> +; CHECK-CHILDREN-NOT: LoopUnrollPass +; +; Revisit the children of the outer loop that are part of the epilogue. 
+; +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.0.epil<header> +; CHECK-NOT: LoopUnrollPass +; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.1.epil<header> +; CHECK-NOT: LoopUnrollPass +l0.latch: + br label %l0 +; CHECK: LoopUnrollPass on Loop at depth 1 containing: %l0<header> +; CHECK-NOT: LoopUnrollPass + +exit: + ret void +} +!1 = !{!1, !2} +!2 = !{!"llvm.loop.unroll.count", i32 2} diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll index b5299bb17f82..04661314eb1d 100644 --- a/test/Transforms/LoopUnroll/runtime-loop.ll +++ b/test/Transforms/LoopUnroll/runtime-loop.ll @@ -1,6 +1,9 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-runtime=true -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG + target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; Tests for unrolling loops with run-time trip counts diff --git a/test/Transforms/LoopUnroll/runtime-loop1.ll b/test/Transforms/LoopUnroll/runtime-loop1.ll index 5d7c64824788..d32c83571b5a 100644 --- a/test/Transforms/LoopUnroll/runtime-loop1.ll +++ b/test/Transforms/LoopUnroll/runtime-loop1.ll @@ -1,6 +1,9 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG ; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG + ; This tests that setting the unroll count works diff --git a/test/Transforms/LoopUnroll/runtime-loop2.ll b/test/Transforms/LoopUnroll/runtime-loop2.ll index 3ce8702a9463..7e7fb9787130 100644 --- a/test/Transforms/LoopUnroll/runtime-loop2.ll +++ b/test/Transforms/LoopUnroll/runtime-loop2.ll @@ -1,5 +1,8 @@ -; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-runtime-epilog=true -unroll-count=8 | FileCheck %s -check-prefix=EPILOG -; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG +; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-partial-threshold=25 -unroll-runtime -unroll-runtime-epilog=true -unroll-count=8 | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-partial-threshold=25 -unroll-runtime -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG + +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-threshold=25 -unroll-partial-threshold=25 -unroll-runtime -unroll-runtime-epilog=true -unroll-count=8 | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-threshold=25 
-unroll-partial-threshold=25 -unroll-runtime -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG ; Choose a smaller, power-of-two, unroll count if the loop is too large. ; This test makes sure we're not unrolling 'odd' counts diff --git a/test/Transforms/LoopUnroll/runtime-loop3.ll b/test/Transforms/LoopUnroll/runtime-loop3.ll index fd13ebfa0b84..253993ee42d4 100644 --- a/test/Transforms/LoopUnroll/runtime-loop3.ll +++ b/test/Transforms/LoopUnroll/runtime-loop3.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt < %s -disable-output -stats -loop-unroll -unroll-runtime -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS +; RUN: opt < %s -disable-output -stats -loop-unroll -unroll-runtime -unroll-partial-threshold=200 -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS +; RUN: opt < %s -disable-output -stats -passes='require<opt-remark-emit>,loop(unroll)' -unroll-runtime -unroll-partial-threshold=200 -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS ; Test that nested loops can be unrolled. We need to increase threshold to do it diff --git a/test/Transforms/LoopUnroll/runtime-loop5.ll b/test/Transforms/LoopUnroll/runtime-loop5.ll index e8d51775ce18..86a26baca657 100644 --- a/test/Transforms/LoopUnroll/runtime-loop5.ll +++ b/test/Transforms/LoopUnroll/runtime-loop5.ll @@ -1,6 +1,9 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-count=16 | FileCheck --check-prefix=UNROLL-16 %s ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-count=4 | FileCheck --check-prefix=UNROLL-4 %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-runtime=true -unroll-count=16 | FileCheck --check-prefix=UNROLL-16 %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll)' -unroll-runtime=true -unroll-count=4 | FileCheck --check-prefix=UNROLL-4 %s + ; Given that the trip-count of this loop is a 3-bit value, we cannot ; safely unroll it with a count of anything more than 8. @@ -11,9 +14,6 @@ entry: %cmp1 = icmp eq i3 %n, 0 br i1 %cmp1, label %for.end, label %for.body -; UNROLL-16-NOT: for.body.prol: -; UNROLL-4: for.body.prol: - for.body: ; preds = %for.body, %entry ; UNROLL-16-LABEL: for.body: ; UNROLL-4-LABEL: for.body: @@ -39,6 +39,10 @@ for.body: ; preds = %for.body, %entry ; UNROLL-16-LABEL: for.end ; UNROLL-4-LABEL: for.end + +; UNROLL-16-NOT: for.body.epil: +; UNROLL-4: for.body.epil: + for.end: ; preds = %for.body, %entry %sum.0.lcssa = phi i3 [ 0, %entry ], [ %add, %for.body ] ret i3 %sum.0.lcssa diff --git a/test/Transforms/LoopUnroll/unloop.ll b/test/Transforms/LoopUnroll/unloop.ll index db7bad5322c5..6af13a55d6b9 100644 --- a/test/Transforms/LoopUnroll/unloop.ll +++ b/test/Transforms/LoopUnroll/unloop.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -verify-loop-info | FileCheck %s -; RUN: opt < %s -S -passes='function(require<scalar-evolution>,require<targetir>,require<opt-remark-emit>,loop(unroll),verify<loops>)' | FileCheck %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll),verify<loops>' | FileCheck %s ; ; Unit tests for LoopInfo::markAsRemoved. 
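A note on the prolog/epilog flags exercised by the runtime-unrolling tests above:
with -unroll-runtime-epilog=true the leftover iterations run after the unrolled
body rather than before it. A minimal sketch of the epilogue shape for an unroll
factor of 4 (block and value names here are illustrative; the tests themselves
only rely on the ".epil" suffix):

    entry:
      %xtraiter = and i64 %n, 3          ; trip count % 4 iterations remain
      ...
    for.body:                            ; 4x unrolled main loop
      ...
      br i1 %cmp, label %for.body, label %for.body.epil.preheader
    for.body.epil:                       ; executes %xtraiter times afterwards
      ...
      br i1 %cmp.epil, label %for.body.epil, label %for.end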
diff --git a/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll b/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll index f7add40b9d15..6778a52b3af8 100644 --- a/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll +++ b/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -3,12 +3,12 @@ @known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 ; CHECK-LABEL: @bar_prof -; CHECK: loop.prol: ; CHECK: loop: ; CHECK: %mul = mul ; CHECK: %mul.1 = mul ; CHECK: %mul.2 = mul ; CHECK: %mul.3 = mul +; CHECK: loop.epil: define i32 @bar_prof(i32* noalias nocapture readonly %src, i64 %c) !prof !1 { entry: br label %loop @@ -32,7 +32,7 @@ loop.end: } ; CHECK-LABEL: @bar_prof_flat -; CHECK-NOT: loop.prol +; CHECK-NOT: loop.epil define i32 @bar_prof_flat(i32* noalias nocapture readonly %src, i64 %c) !prof !1 { entry: br label %loop diff --git a/test/Transforms/LoopUnroll/unroll-pragmas.ll b/test/Transforms/LoopUnroll/unroll-pragmas.ll index 2843e627b3c1..88f32c92d694 100644 --- a/test/Transforms/LoopUnroll/unroll-pragmas.ll +++ b/test/Transforms/LoopUnroll/unroll-pragmas.ll @@ -171,10 +171,6 @@ for.end: ; preds = %for.body, %entry ; should be duplicated (original and 4x unrolled). ; ; CHECK-LABEL: @runtime_loop_with_count4( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body ; CHECK: store ; CHECK: store @@ -182,6 +178,10 @@ for.end: ; preds = %for.body, %entry ; CHECK: store ; CHECK-NOT: store ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 @@ -287,10 +287,6 @@ for.end: ; preds = %for.body ; (original and 8x). ; ; CHECK-LABEL: @runtime_loop_with_enable( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body: ; CHECK: store i32 ; CHECK: store i32 @@ -302,6 +298,10 @@ for.end: ; preds = %for.body ; CHECK: store i32 ; CHECK-NOT: store i32 ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_enable(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 @@ -328,16 +328,16 @@ for.end: ; preds = %for.body, %entry ; should be duplicated (original and 3x unrolled). ; ; CHECK-LABEL: @runtime_loop_with_count3( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body ; CHECK: store ; CHECK: store ; CHECK: store ; CHECK-NOT: store ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_count3(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 diff --git a/test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll b/test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll index adbf47defe8f..6748ebefa522 100644 --- a/test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll +++ b/test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll @@ -1,4 +1,5 @@ ; RUN: opt -S < %s -loop-unroll -block-freq | FileCheck %s +; RUN: opt -S < %s -passes='require<opt-remark-emit>,loop(unroll),require<block-freq>' | FileCheck %s ; Crasher from PR20987. 
; CHECK: define void @update_loop_info_in_subloops diff --git a/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll b/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll new file mode 100644 index 000000000000..1f106bd894a8 --- /dev/null +++ b/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll @@ -0,0 +1,85 @@ +; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s + +; Check that loop unswitch happened and condition hoisted out of the loop. +; Condition is uniform so all targets should perform unswitching. + +; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch +; CHECK: entry: +; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp +; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456 +; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]] +; CHECK-NEXT: br i1 + +define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp eq i32 %x, 123456 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.inc, %for.body.lr.ph + %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07 + store i32 %i.07, i32 * %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +; Check that loop unswitch does not happen if condition is divergent. 
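+;
+; The loop-exit test below branches on a value derived from
+; llvm.amdgcn.workitem.id.x(), which can differ between lanes of a wave, so
+; unswitching would hoist a per-lane decision out of the loop. Schematically
+; (mirroring the two kernels in this file):
+;
+;   %cmp1 = icmp eq i32 %x, 123456       ; uniform across lanes -> unswitchable
+;   %tid  = call i32 @llvm.amdgcn.workitem.id.x()
+;   %cmp2 = icmp eq i32 %tid, 567890     ; divergent -> must stay in the loop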
+ +; CHECK-LABEL: {{^}}define amdgpu_kernel void @divergent_unswitch +; CHECK: entry: +; CHECK: icmp +; CHECK: [[IF_COND:%[a-z0-9]+]] = icmp {{.*}} 567890 +; CHECK: br label +; CHECK: br i1 [[IF_COND]] + +define amdgpu_kernel void @divergent_unswitch(i32 * nocapture %out, i32 %n) { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %call = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %cmp2 = icmp eq i32 %call, 567890 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.inc, %for.body.lr.ph + %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + br i1 %cmp2, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.010 + store i32 %i.010, i32 * %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %inc = add nuw nsw i32 %i.010, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/Transforms/LoopUnswitch/AMDGPU/lit.local.cfg b/test/Transforms/LoopUnswitch/AMDGPU/lit.local.cfg new file mode 100644 index 000000000000..2a665f06be72 --- /dev/null +++ b/test/Transforms/LoopUnswitch/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/LoopUnswitch/basictest.ll b/test/Transforms/LoopUnswitch/basictest.ll index a02a463764dd..3add848395ae 100644 --- a/test/Transforms/LoopUnswitch/basictest.ll +++ b/test/Transforms/LoopUnswitch/basictest.ll @@ -101,6 +101,217 @@ loop_exit: ; CHECK: } } +; Make sure we unswitch %a == 0 out of the loop. +; +; CHECK: define void @and_i2_as_switch_input(i2 +; CHECK: entry: +; This is an indication that the loop has been unswitched. +; CHECK: icmp eq i2 %a, 0 +; CHECK: br +; There should be no more unswitching after the 1st unswitch. +; CHECK-NOT: icmp eq +; CHECK: ret +define void @and_i2_as_switch_input(i2 %a) { +entry: + br label %for.body + +for.body: + %i = phi i2 [ 0, %entry ], [ %inc, %for.inc ] + %and = and i2 %a, %i + %and1 = and i2 %and, %i + switch i2 %and1, label %sw.default [ + i2 0, label %sw.bb + i2 1, label %sw.bb1 + ] + +sw.bb: + br label %sw.epilog + +sw.bb1: + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + br label %for.inc + +for.inc: + %inc = add nsw i2 %i, 1 + %cmp = icmp slt i2 %inc, 3 + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +; Make sure we unswitch %a == !0 out of the loop. +; +; CHECK: define void @or_i2_as_switch_input(i2 +; CHECK: entry: +; This is an indication that the loop has been unswitched. +; CHECK: icmp eq i2 %a, -1 +; CHECK: br +; There should be no more unswitching after the 1st unswitch. 
+; CHECK-NOT: icmp eq +; CHECK: ret +define void @or_i2_as_switch_input(i2 %a) { +entry: + br label %for.body + +for.body: + %i = phi i2 [ 0, %entry ], [ %inc, %for.inc ] + %or = or i2 %a, %i + %or1 = or i2 %or, %i + switch i2 %or1, label %sw.default [ + i2 2, label %sw.bb + i2 3, label %sw.bb1 + ] + +sw.bb: + br label %sw.epilog + +sw.bb1: + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + br label %for.inc + +for.inc: + %inc = add nsw i2 %i, 1 + %cmp = icmp slt i2 %inc, 3 + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +; Make sure we unswitch %a == !0 out of the loop. Even we do not +; have it as a case value. Unswitching it out allows us to simplify +; the or operator chain. +; +; CHECK: define void @or_i2_as_switch_input_unswitch_default(i2 +; CHECK: entry: +; This is an indication that the loop has been unswitched. +; CHECK: icmp eq i2 %a, -1 +; CHECK: br +; There should be no more unswitching after the 1st unswitch. +; CHECK-NOT: icmp eq +; CHECK: ret +define void @or_i2_as_switch_input_unswitch_default(i2 %a) { +entry: + br label %for.body + +for.body: + %i = phi i2 [ 0, %entry ], [ %inc, %for.inc ] + %or = or i2 %a, %i + %or1 = or i2 %or, %i + switch i2 %or1, label %sw.default [ + i2 1, label %sw.bb + i2 2, label %sw.bb1 + ] + +sw.bb: + br label %sw.epilog + +sw.bb1: + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + br label %for.inc + +for.inc: + %inc = add nsw i2 %i, 1 + %cmp = icmp slt i2 %inc, 3 + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +; Make sure we don't unswitch, as we can not find an input value %a +; that will effectively unswitch 0 or 3 out of the loop. +; +; CHECK: define void @and_or_i2_as_switch_input(i2 +; CHECK: entry: +; This is an indication that the loop has NOT been unswitched. +; CHECK-NOT: icmp +; CHECK: br +define void @and_or_i2_as_switch_input(i2 %a) { +entry: + br label %for.body + +for.body: + %i = phi i2 [ 0, %entry ], [ %inc, %for.inc ] + %and = and i2 %a, %i + %or = or i2 %and, %i + switch i2 %or, label %sw.default [ + i2 0, label %sw.bb + i2 3, label %sw.bb1 + ] + +sw.bb: + br label %sw.epilog + +sw.bb1: + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + br label %for.inc + +for.inc: + %inc = add nsw i2 %i, 1 + %cmp = icmp slt i2 %inc, 3 + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +; Make sure we don't unswitch, as we can not find an input value %a +; that will effectively unswitch true/false out of the loop. +; +; CHECK: define void @and_or_i1_as_branch_input(i1 +; CHECK: entry: +; This is an indication that the loop has NOT been unswitched. 
+; CHECK-NOT: icmp +; CHECK: br +define void @and_or_i1_as_branch_input(i1 %a) { +entry: + br label %for.body + +for.body: + %i = phi i1 [ 0, %entry ], [ %inc, %for.inc ] + %and = and i1 %a, %i + %or = or i1 %and, %i + br i1 %or, label %sw.bb, label %sw.bb1 + +sw.bb: + br label %sw.epilog + +sw.bb1: + br label %sw.epilog + +sw.epilog: + br label %for.inc + +for.inc: + %inc = add nsw i1 %i, 1 + %cmp = icmp slt i1 %inc, 1 + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} declare void @incf() noreturn declare void @decf() noreturn diff --git a/test/Transforms/LoopUnswitch/cold-loop.ll b/test/Transforms/LoopUnswitch/cold-loop.ll deleted file mode 100644 index 1fbc08038bbd..000000000000 --- a/test/Transforms/LoopUnswitch/cold-loop.ll +++ /dev/null @@ -1,52 +0,0 @@ -; RUN: opt < %s -loop-unswitch -loop-unswitch-with-block-frequency -S 2>&1 | FileCheck %s - -;; trivial condition should be unswithed regardless of coldness. -define i32 @test1(i1 %cond1, i1 %cond2) !prof !1 { - br i1 %cond1, label %loop_begin, label %loop_exit, !prof !0 - -loop_begin: -; CHECK: br i1 true, label %continue, label %loop_exit.loopexit - br i1 %cond2, label %continue, label %loop_exit ; trivial condition - -continue: - call void @some_func1() noreturn nounwind - br label %loop_begin - -loop_exit: - ret i32 0 -} - -;; cold non-trivial condition should not be unswitched. -define i32 @test2(i32* %var, i1 %cond1, i1 %cond2) !prof !1 { - br i1 %cond1, label %loop_begin, label %loop_exit, !prof !0 - -loop_begin: - store i32 1, i32* %var -; CHECK: br i1 %cond2, label %continue1, label %continue2 - br i1 %cond2, label %continue1, label %continue2 ; non-trivial condition - -continue1: - call void @some_func1() noreturn nounwind - br label %joint - -continue2: - call void @some_func2() noreturn nounwind - br label %joint - -joint: -;; unswitching will duplicate these calls. - call void @some_func3() noreturn nounwind - call void @some_func4() noreturn nounwind - br label %loop_begin - -loop_exit: - ret i32 0 -} - -declare void @some_func1() noreturn -declare void @some_func2() noreturn -declare void @some_func3() noreturn -declare void @some_func4() noreturn - -!0 = !{!"branch_weights", i32 1, i32 100000000} -!1 = !{!"function_entry_count", i64 100} diff --git a/test/Transforms/LoopUnswitch/copy-metadata.ll b/test/Transforms/LoopUnswitch/copy-metadata.ll index 2a634c25a23d..3302bce9a6e5 100644 --- a/test/Transforms/LoopUnswitch/copy-metadata.ll +++ b/test/Transforms/LoopUnswitch/copy-metadata.ll @@ -3,11 +3,11 @@ ; This test checks if unswitched condition preserve make.implicit metadata. 
 define i32 @test(i1 %cond) {
-; CHECK: br i1 %cond, label %..split_crit_edge, label %.loop_exit.split_crit_edge, !make.implicit !0
+; CHECK-LABEL: @test(
+; CHECK: br i1 %cond, label %..split_crit_edge, label %.loop_exit.split_crit_edge, !make.implicit !0
   br label %loop_begin
 
 loop_begin:
-; CHECK: br i1 true, label %continue, label %loop_exit, !make.implicit !0
   br i1 %cond, label %continue, label %loop_exit, !make.implicit !0
 
 continue:
diff --git a/test/Transforms/LoopUnswitch/crash.ll b/test/Transforms/LoopUnswitch/crash.ll
index 101fb7a2c2ce..b273a123c39c 100644
--- a/test/Transforms/LoopUnswitch/crash.ll
+++ b/test/Transforms/LoopUnswitch/crash.ll
@@ -30,7 +30,7 @@ return: ; preds = %return.loopexit, %list_Length.exit9
   ret void
 }
 
-define void @test2(i32 %x1, i32 %y1, i32 %z1, i32 %r1) nounwind {
+define void @test2() nounwind {
 entry:
   br label %bb.nph
 
diff --git a/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll b/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll
new file mode 100644
index 000000000000..d033b083a1b8
--- /dev/null
+++ b/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll
@@ -0,0 +1,58 @@
+; RUN: opt < %s -loop-unswitch -verify-loop-info -S 2>&1 | FileCheck %s
+
+; The switch has 1 case and 1 default case. After we unswitch, we know
+; %a is definitely not 0 in one of the unswitched loops; make sure we take
+; advantage of that and simplify the branches in the loop.
+;
+; CHECK: define void @simplify_with_nonvalness(
+
+; This is the loop in which we know %a is definitely 0.
+; CHECK: sw.bb.us:
+; CHECK: br i1 true, label %if.then.us, label %if.end.us
+
+; This is the loop in which we do not know what %a is, but we know %a is definitely NOT 0.
+; Make sure we use that information to simplify.
+; The icmp eq i32 %a, 0 in one of the unswitched loops is simplified to false.
+; CHECK: sw.bb.split:
+; CHECK: br i1 false, label %if.then, label %if.end
+
+define void @simplify_with_nonvalness(i32 %a) #0 {
+entry:
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  switch i32 %a, label %sw.default [
+    i32 0, label %sw.bb
+  ]
+
+sw.bb:
+  %cmp1 = icmp eq i32 %a, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  call void (...) @bar()
+  br label %if.end
+
+if.end:
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret void
+}
+
+declare void @bar(...)
diff --git a/test/Transforms/LoopUnswitch/trivial-unswitch.ll b/test/Transforms/LoopUnswitch/trivial-unswitch.ll
index db3328278dae..2def5b6f0334 100644
--- a/test/Transforms/LoopUnswitch/trivial-unswitch.ll
+++ b/test/Transforms/LoopUnswitch/trivial-unswitch.ll
@@ -44,4 +44,48 @@ loop_exit:
   ret i32 0
 }
 
-declare void @some_func() noreturn
\ No newline at end of file
+
+; We will not be able to trivially unswitch on the SwitchInst, as its input
+; is a constant. However, since it's a constant we should be able to figure
+; out that the switch can be folded into an unconditional branch to %continue.
+; Then we unswitch on the br inst in %continue.
+;
+; CHECK: define i32 @test2(
+; This is an indication that the loop has been unswitched on %cond1.
+; CHECK: br i1 %cond1, label %..split_crit_edge, label %.loop_exit.split_crit_edge
+
+; CHECK: ..split_crit_edge: ; preds = %0
+; CHECK: br label %.split
+
+; CHECK: .split: ; preds = %..split_crit_edge
+; CHECK: br label %loop_begin
+
+; CHECK: loop_begin: ; preds = %do_something, %.split
+; CHECK: switch i32
+
+; CHECK: continue: ; preds = %loop_begin
+; CHECK: %var_val = load i32, i32* %var
+; CHECK: br i1 true, label %do_something, label %loop_exit
+
+define i32 @test2(i32* %var, i1 %cond1) {
+  br label %loop_begin
+
+loop_begin:
+  switch i32 1, label %continue [
+    i32 0, label %loop_exit
+    i32 1, label %continue
+  ]
+
+continue:
+  %var_val = load i32, i32* %var
+  br i1 %cond1, label %do_something, label %loop_exit
+
+do_something:
+  call void @some_func() noreturn nounwind
+  br label %loop_begin
+
+loop_exit:
+  ret i32 0
+}
+
+declare void @some_func() noreturn
diff --git a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 21b59f87d042..37a6d4e79984 100644
--- a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -1,40 +1,55 @@
-; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
-; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
 
-; CHECK-LABEL: predicated_udiv_scalarized_operand
-;
 ; This test checks that we correctly compute the scalarized operands for a
 ; user-specified vectorization factor when interleaving is disabled. We use the
-; "optsize" attribute to disable all interleaving calculations.
+; "optsize" attribute to disable all interleaving calculations. A cost of 4
+; for %tmp4 indicates that we would scalarize its operand (%tmp3), giving
+; %tmp4 a lower scalarization overhead.
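+;
+; One plausible reading of that cost of 4 (an assumption about the cost model,
+; not something this test asserts beyond the COST line below): at VF 2 the
+; predicated udiv is replicated into 2 scalar udivs, and the rest of the
+; estimate covers the extractelement/insertelement traffic around the
+; predicated blocks, as the CHECK lines illustrate.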
; -; CHECK: vector.body: -; CHECK: %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4 -; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]] -; CHECK: [[IF0]]: -; CHECK: %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0 -; CHECK: %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0 -; CHECK: %[[T02:.+]] = add nsw i64 %[[T01]], %x -; CHECK: %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]] -; CHECK: %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0 -; CHECK: br label %[[CONT0]] -; CHECK: [[CONT0]]: -; CHECK: %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ] -; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]] -; CHECK: [[IF1]]: -; CHECK: %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1 -; CHECK: %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1 -; CHECK: %[[T08:.+]] = add nsw i64 %[[T07]], %x -; CHECK: %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]] -; CHECK: %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1 -; CHECK: br label %[[CONT1]] -; CHECK: [[CONT1]]: -; CHECK: phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ] -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body - -define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize { +; COST-LABEL: predicated_udiv_scalarized_operand +; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3 +; +; CHECK-LABEL: @predicated_udiv_scalarized_operand( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]] +; CHECK: [[PRED_UDIV_IF]]: +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0 +; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]] +; CHECK: [[PRED_UDIV_CONTINUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]] +; CHECK: [[PRED_UDIV_IF1]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x +; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]] +; CHECK: [[PRED_UDIV_CONTINUE2]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = 
select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize { entry: br label %for.body @@ -43,7 +58,8 @@ for.body: %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ] %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i %tmp2 = load i64, i64* %tmp0, align 4 - br i1 %c, label %if.then, label %for.inc + %cond0 = icmp sgt i64 %tmp2, 0 + br i1 %cond0, label %if.then, label %for.inc if.then: %tmp3 = add nsw i64 %tmp2, %x @@ -54,8 +70,8 @@ for.inc: %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then] %tmp6 = add i64 %r, %tmp5 %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, 100 - br i1 %cond, label %for.body, label %for.end + %cond1 = icmp slt i64 %i.next, 100 + br i1 %cond1, label %for.body, label %for.end for.end: %tmp7 = phi i64 [ %tmp6, %for.inc ] diff --git a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll deleted file mode 100644 index fc68adb59df3..000000000000 --- a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll +++ /dev/null @@ -1,341 +0,0 @@ -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC - -target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" - -; CHECK-LABEL: @recurrence_1 -; -; void recurrence_1(int *a, int *b, int n) { -; for(int i = 0; i < n; i++) -; b[i] = a[i] + a[i - 1] -; } -; -; CHECK: vector.ph: -; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3 -; -; CHECK: vector.body: -; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] -; CHECK: [[L1]] = load <4 x i32> -; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; CHECK: middle.block: -; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 -; -; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ] -; -; CHECK: scalar.body: -; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] -; -; UNROLL: vector.body: -; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] -; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> -; UNROLL: [[L2]] = load <4 x i32> -; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; UNROLL: middle.block: -; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 -; -define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) { -entry: - br label %for.preheader - -for.preheader: - %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0 - %pre_load = load i32, i32* 
%arrayidx.phi.trans.insert - br label %scalar.body - -scalar.body: - %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] - %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next - %1 = load i32, i32* %arrayidx32 - %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv - %add35 = add i32 %1, %0 - store i32 %add35, i32* %arrayidx34 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %for.exit, label %scalar.body - -for.exit: - ret void -} - -; CHECK-LABEL: @recurrence_2 -; -; int recurrence_2(int *a, int n) { -; int minmax; -; for (int i = 0; i < n; ++i) -; minmax = min(minmax, max(a[i] - a[i-1], 0)); -; return minmax; -; } -; -; CHECK: vector.ph: -; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3 -; -; CHECK: vector.body: -; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] -; CHECK: [[L1]] = load <4 x i32> -; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; CHECK: middle.block: -; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 -; -; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ] -; -; CHECK: scalar.body: -; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] -; -; UNROLL: vector.body: -; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] -; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> -; UNROLL: [[L2]] = load <4 x i32> -; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; UNROLL: middle.block: -; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 -; -define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) { -entry: - %cmp27 = icmp sgt i32 %n, 0 - br i1 %cmp27, label %for.preheader, label %for.cond.cleanup - -for.preheader: - %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1 - %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4 - br label %scalar.body - -for.cond.cleanup.loopexit: - %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ] - br label %for.cond.cleanup - -for.cond.cleanup: - %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %minmax.0.lcssa - -scalar.body: - %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ] - %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] - %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ] - %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv - %1 = load i32, i32* %arrayidx, align 4 - %sub3 = sub nsw i32 %1, %0 - %cmp4 = icmp sgt i32 %sub3, 0 - %cond = select i1 %cmp4, i32 %sub3, i32 0 - %cmp5 = icmp slt i32 %minmax.028, %cond - %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, 
label %for.cond.cleanup.loopexit, label %scalar.body -} - -; CHECK-LABEL: @recurrence_3 -; -; void recurrence_3(short *a, double *b, int n, float f, short p) { -; b[0] = (double)a[0] - f * (double)p; -; for (int i = 1; i < n; i++) -; b[i] = (double)a[i] - f * (double)a[i - 1]; -; } -; -; -; CHECK: vector.ph: -; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3 -; -; CHECK: vector.body: -; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] -; CHECK: [[L1]] = load <4 x i16> -; CHECK: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; CHECK: middle.block: -; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 -; -; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ] -; -; CHECK: scalar.body: -; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] -; -; UNROLL: vector.body: -; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] -; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16> -; UNROLL: [[L2]] = load <4 x i16> -; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; UNROLL: middle.block: -; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3 -; -define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) { -entry: - %0 = load i16, i16* %a, align 2 - %conv = sitofp i16 %0 to double - %conv1 = fpext float %f to double - %conv2 = sitofp i16 %p to double - %mul = fmul fast double %conv2, %conv1 - %sub = fsub fast double %conv, %mul - store double %sub, double* %b, align 8 - %cmp25 = icmp sgt i32 %n, 1 - br i1 %cmp25, label %for.preheader, label %for.end - -for.preheader: - br label %scalar.body - -scalar.body: - %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ] - %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ] - %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv - %2 = load i16, i16* %arrayidx5, align 2 - %conv6 = sitofp i16 %2 to double - %conv11 = sitofp i16 %1 to double - %mul12 = fmul fast double %conv11, %conv1 - %sub13 = fsub fast double %conv6, %mul12 - %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv - store double %sub13, double* %arrayidx15, align 8 - %advars.iv.next = add nuw nsw i64 %advars.iv, 1 - %lftr.wideiv = trunc i64 %advars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %for.end.loopexit, label %scalar.body - -for.end.loopexit: - br label %for.end - -for.end: - ret void -} - -; CHECK-LABEL: @PR26734 -; -; void PR26734(short *a, int *b, int *c, int d, short *e) { -; for (; d != 21; d++) { -; *b &= *c; -; *e = *a - 6; -; *c = *e; -; } -; } -; -; CHECK-NOT: vector.ph: -; -define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) { -entry: - %cmp4 = icmp eq i32 %d, 21 - br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph - -entry.for.end_crit_edge: - %.pre = load i32, i32* %b, align 4 - br label %for.end - -for.body.lr.ph: - %0 = load i16, i16* %a, align 2 - %sub = add i16 %0, -6 - %conv2 = sext i16 %sub to i32 - %c.promoted = load i32, 
i32* %c, align 4 - %b.promoted = load i32, i32* %b, align 4 - br label %for.body - -for.body: - %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ] - %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ] - %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ] - %and = and i32 %and6, %conv25 - %inc = add nsw i32 %inc7, 1 - %cmp = icmp eq i32 %inc, 21 - br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body - -for.cond.for.end_crit_edge: - %and.lcssa = phi i32 [ %and, %for.body ] - store i32 %conv2, i32* %c, align 4 - store i32 %and.lcssa, i32* %b, align 4 - store i16 %sub, i16* %e, align 2 - br label %for.end - -for.end: - ret void -} - -; CHECK-LABEL: @PR27246 -; -; int PR27246() { -; unsigned int e, n; -; for (int i = 1; i < 49; ++i) { -; for (int k = i; k > 1; --k) -; e = k; -; n = e; -; } -; return n; -; } -; -; CHECK-NOT: vector.ph: -; -define i32 @PR27246() { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ] - %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ] - br label %for.cond1 - -for.cond.cleanup: - %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ] - ret i32 %e.1.lcssa.lcssa - -for.cond1: - %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ] - %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ] - %cmp2 = icmp sgt i32 %k.0, 1 - %dec = add nsw i32 %k.0, -1 - br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3 - -for.cond.cleanup3: - %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ] - %inc = add nuw nsw i32 %i.016, 1 - %exitcond = icmp eq i32 %inc, 49 - br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader -} - -; CHECK-LABEL: @PR29559 -; -; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC: br label %vector.body -; -; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; UNROLL-NO-IC: %vector.recur = phi <4 x float*> [ undef, %vector.ph ], [ %[[I4:.+]], %vector.body ] -; UNROLL-NO-IC: %[[G1:.+]] = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0 -; UNROLL-NO-IC: %[[I1:.+]] = insertelement <4 x float*> undef, float* %[[G1]], i32 0 -; UNROLL-NO-IC: %[[I2:.+]] = insertelement <4 x float*> %[[I1]], float* %[[G1]], i32 1 -; UNROLL-NO-IC: %[[I3:.+]] = insertelement <4 x float*> %[[I2]], float* %[[G1]], i32 2 -; UNROLL-NO-IC: %[[I4]] = insertelement <4 x float*> %[[I3]], float* %[[G1]], i32 3 -; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %vector.recur, <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %[[I4]], <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x float*> %[[I4]], i32 3 -; -; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC: %scalar.recur.init = phi float* [ %vector.recur.extract, %middle.block ], [ undef, %min.iters.checked ], [ undef, %entry ] -; -; UNROLL-NO-IC: scalar.body: -; UNROLL-NO-IC: %scalar.recur = phi float* [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] -; -define void @PR29559() { -entry: - br label %scalar.body - -scalar.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ] - %tmp2 = phi float* [ undef, %entry ], [ %tmp3, %scalar.body ] - %tmp3 = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0 - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp eq i64 %i.next, undef - br i1 %cond, 
label %for.end, label %scalar.body - -for.end: - ret void -} diff --git a/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll new file mode 100644 index 000000000000..e8ef42562356 --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @non_primary_iv_trunc_free( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5 +; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @non_primary_iv_trunc_free(i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = trunc i64 %i to i32 + %i.next = add nuw nsw i64 %i, 5 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll new file mode 100644 index 000000000000..0ebb7a92edae --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -0,0 +1,38 @@ +; REQUIRES: asserts +; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s + +; This test shows extremely high interleaving cost that, probably, should be fixed. +; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize +; the load instructions. 
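+;
+; Note on reading the COST lines below: the first member of the interleave
+; group is charged the entire group cost (20 for VF 2 here) while the second
+; member reports 0, so the two numbers describe a single decision.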
+ +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%pair = type { i8, i8 } + +; CHECK-LABEL: test +; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8 +; CHECK: vector.body +; CHECK: load i8 +; CHECK: load i8 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body + +define void @test(%pair* %p, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0 + %tmp1 = load i8, i8* %tmp0, align 1 + %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1 + %tmp3 = load i8, i8* %tmp2, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index df1f9c619408..54ee8fc6e73f 100644 --- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -1,81 +1,189 @@ -; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" -@AB = common global [1024 x i8] zeroinitializer, align 4 -@CD = common global [1024 x i8] zeroinitializer, align 4 +%i8.2 = type {i8, i8} +define void @i8_factor_2(%i8.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_8-LABEL: Checking a loop in "i8_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 +; VF_16-LABEL: Checking a loop in "i8_factor_2" +; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1 + %tmp2 = load i8, i8* %tmp0, align 1 + %tmp3 = load i8, i8* %tmp1, align 1 + store i8 0, i8* %tmp0, align 1 + store i8 0, i8* %tmp1, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 
%cond, label %for.body, label %for.end + +for.end: + ret void +} + +%i16.2 = type {i16, i16} +define void @i16_factor_2(%i16.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_4-LABEL: Checking a loop in "i16_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "i16_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-LABEL: Checking a loop in "i16_factor_2" +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1 + %tmp2 = load i16, i16* %tmp0, align 2 + %tmp3 = load i16, i16* %tmp1, align 2 + store i16 0, i16* %tmp0, align 2 + store i16 0, i16* %tmp1, align 2 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end -define void @test_byte_interleaved_cost(i8 %C, i8 %D) { +for.end: + ret void +} + +%i32.2 = type {i32, i32} +define void @i32_factor_2(%i32.2* %data, i64 %n) { entry: br label %for.body -; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved -; access group is 2. 
- -; CHECK: LV: Checking a loop in "test_byte_interleaved_cost" -; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 -; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv - %tmp = load i8, i8* %arrayidx0, align 4 - %tmp1 = or i64 %indvars.iv, 1 - %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1 - %tmp2 = load i8, i8* %arrayidx1, align 4 - %add = add nsw i8 %tmp, %C - %mul = mul nsw i8 %tmp2, %D - %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv - store i8 %add, i8* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1 - store i8 %mul, i8* %arrayidx3, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp slt i64 %indvars.iv.next, 1024 - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body +; VF_2-LABEL: Checking a loop in "i32_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-LABEL: Checking a loop in "i32_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_8-LABEL: Checking a loop in "i32_factor_2" +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_16-LABEL: Checking a loop in "i32_factor_2" +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1 + %tmp2 = load i32, i32* %tmp0, align 4 + %tmp3 = load i32, i32* %tmp1, align 4 + store i32 0, i32* %tmp0, align 4 + store i32 0, i32* %tmp1, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ret void } -%ig.factor.8 = type 
{ double*, double, double, double, double, double, double, double } -define double @wide_interleaved_group(%ig.factor.8* %s, double %a, double %b, i32 %n) { +%i64.2 = type {i64, i64} +define void @i64_factor_2(%i64.2* %data, i64 %n) { entry: br label %for.body -; Check the default cost of a strided load with a factor that is greater than -; the maximum allowed. In this test, the interleave factor would be 8, which is -; not supported. +; VF_2-LABEL: Checking a loop in "i64_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_4-LABEL: Checking a loop in "i64_factor_2" +; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_8-LABEL: Checking a loop in "i64_factor_2" +; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_16-LABEL: Checking a loop in "i64_factor_2" +; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1 + %tmp2 = load i64, i64* %tmp0, align 8 + %tmp3 = load i64, i64* %tmp1, align 8 + store i64 0, i64* %tmp0, align 8 + store i64 0, i64* %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} -; CHECK: LV: Checking a loop in "wide_interleaved_group" -; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %1 = load double, double* %0, align 8 -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %5 = load double, double* %4, align 8 -; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store double %9, double* %10, align 8 +%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64} +define void @i64_factor_8(%i64.8* %data, i64 %n) { +entry: + br label %for.body +; The interleave factor in this test is 8, which is greater than the maximum +; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI +; implementation for determining the cost of the interleaved load group. 
The +; stores do not form a legal interleaved group because the group would contain +; gaps. +; +; VF_2-LABEL: Checking a loop in "i64_factor_8" +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %r = phi double [ 0.000000e+00, %entry ], [ %12, %for.body ] - %0 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 2 - %1 = load double, double* %0, align 8 - %2 = fcmp fast olt double %1, %a - %3 = select i1 %2, double 0.000000e+00, double %1 - %4 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 6 - %5 = load double, double* %4, align 8 - %6 = fcmp fast olt double %5, %a - %7 = select i1 %6, double 0.000000e+00, double %5 - %8 = fmul fast double %7, %b - %9 = fadd fast double %8, %3 - %10 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 3 - store double %9, double* %10, align 8 - %11 = fmul fast double %9, %9 - %12 = fadd fast double %11, %r + %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2 + %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6 + %tmp2 = load i64, i64* %tmp0, align 8 + %tmp3 = load i64, i64* %tmp1, align 8 + store i64 0, i64* %tmp0, align 8 + store i64 0, i64* %tmp1, align 8 %i.next = add nuw nsw i64 %i, 1 - %13 = trunc i64 %i.next to i32 - %cond = icmp eq i32 %13, %n - br i1 %cond, label %for.exit, label %for.body + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end -for.exit: - %r.lcssa = phi double [ %12, %for.body ] - ret double %r.lcssa +for.end: + ret void } diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index c7ced757581a..d06e3fdba39c 100644 --- a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -234,12 +234,27 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; CHECK-LABEL: @add_phifail2( -; CHECK: load <16 x i8>, <16 x i8>* -; CHECK: add nuw nsw <16 x i32> -; CHECK: store <16 x i8> ; Function Attrs: nounwind +; When we vectorize this loop, we generate correct code +; even when %len exactly divides VF (since we extract from the second last index +; and pass this to the for.cond.cleanup block). 
The vectorized loop returns +; the correct value, a_phi = p[len - 2]. define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { +; CHECK-LABEL: @add_phifail2( +; CHECK: vector.body: +; CHECK: %wide.load = load <16 x i8>, <16 x i8>* +; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32> +; CHECK: add nuw nsw <16 x i32> +; CHECK: store <16 x i8> +; CHECK: add i64 %index, 16 +; CHECK: icmp eq i64 %index.next, %n.vec +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15 +; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14 +; CHECK: for.cond.cleanup: +; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ] +; CHECK: %ret = trunc i32 %a_phi.lcssa to i8 +; CHECK: ret i8 %ret entry: br label %for.body diff --git a/test/Transforms/LoopVectorize/AArch64/pr31900.ll b/test/Transforms/LoopVectorize/AArch64/pr31900.ll new file mode 100644 index 000000000000..5ea38a4a246d --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/pr31900.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s + +; Reproducer for an address space fault in the LoopVectorizer (pr31900). Added +; different sized address space pointers (p:16:16-p4:32:16) to the aarch64 +; datalayout to reproduce the fault. + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16" + +; Check that all the loads are scalarized +; CHECK: load i16, i16* +; CHECK: load i16, i16* +; CHECK: load i16, i16 addrspace(4)* +; CHECK: load i16, i16 addrspace(4)* + +%rec1445 = type { i16, i16, i16, i16, i16 } + +define void @foo() { +bb1: + br label %bb4 + +bb4: + %tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ] + %tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ] + %tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ] + %0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1 + %_tmp987 = load i16, i16* %0, align 1 + %1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1 + %_tmp993 = load i16, i16 addrspace(4)* %1, align 1 + %_tmp1013 = add i16 %tmp1, 1 + %_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1 + %_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1 + %_tmp1019 = icmp ult i16 %_tmp1013, 24 + br i1 %_tmp1019, label %bb4, label %bb16 + +bb16: + unreachable +} diff --git a/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll new file mode 100644 index 000000000000..1ae7dadeffd7 --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll @@ -0,0 +1,33 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: Checking a loop in "interleaved_access" +; CHECK: The Smallest and Widest types: 64 / 64 bits +; +define void @interleaved_access(i8** %A, i64 %N) { +for.ph: + br label %for.body + +for.body: + %i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ] + %tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i + store i8* null, i8** %tmp0, align 8 + %i.next.0 = add nuw nsw i64 %i, 1 + %tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0 + store i8* null, i8** %tmp1, align 8 + %i.next.1 = add nsw 
i64 %i, 2 + %tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1 + store i8* null, i8** %tmp2, align 8 + %i.next.2 = add nsw i64 %i, 3 + %tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2 + store i8* null, i8** %tmp3, align 8 + %i.next.3 = add nsw i64 %i, 4 + %cond = icmp slt i64 %i.next.3, %N + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg b/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg new file mode 100644 index 000000000000..2a665f06be72 --- /dev/null +++ b/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll b/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll new file mode 100644 index 000000000000..f303ed5377e2 --- /dev/null +++ b/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll @@ -0,0 +1,28 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -loop-vectorize < %s | FileCheck %s + + +; For AMDGPU, loop unrolling inside the loop vectorizer is disabled when VF == 1. +; +; CHECK-LABEL: @small_loop( +; CHECK: store i32 +; CHECK-NOT: store i32 +; CHECK: ret +define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind { +entry: + %0 = icmp sgt i32 %size, 0 + br i1 %0, label %loop, label %exit + +loop: ; preds = %entry, %loop + %iv = phi i32 [ %iv1, %loop ], [ 0, %entry ] + %1 = getelementptr inbounds i32, i32* %inArray, i32 %iv + %2 = load i32, i32* %1, align 4 + %3 = add nsw i32 %2, 6 + store i32 %3, i32* %1, align 4 + %iv1 = add i32 %iv, 1 +; %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %cond = icmp eq i32 %iv1, %size + br i1 %cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void +} diff --git a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll index de3626b57d83..29adec049f67 100644 --- a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll +++ b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -1,39 +1,147 @@ -; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "armv8--linux-gnueabihf" -@AB = common global [1024 x i8] zeroinitializer, align 4 -@CD = common global [1024 x i8] zeroinitializer, align 4 +%i8.2 = type {i8, i8} +define void @i8_factor_2(%i8.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_8-LABEL: Checking a loop in "i8_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For 
instruction: store i8 0, i8* %tmp1, align 1 +; VF_16-LABEL: Checking a loop in "i8_factor_2" +; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1 + %tmp2 = load i8, i8* %tmp0, align 1 + %tmp3 = load i8, i8* %tmp1, align 1 + store i8 0, i8* %tmp0, align 1 + store i8 0, i8* %tmp1, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} -define void @test_byte_interleaved_cost(i8 %C, i8 %D) { +%i16.2 = type {i16, i16} +define void @i16_factor_2(%i16.2* %data, i64 %n) { entry: br label %for.body -; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved -; access group is 2. - -; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 -; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv - %tmp = load i8, i8* %arrayidx0, align 4 - %tmp1 = or i64 %indvars.iv, 1 - %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1 - %tmp2 = load i8, i8* %arrayidx1, align 4 - %add = add nsw i8 %tmp, %C - %mul = mul nsw i8 %tmp2, %D - %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv - store i8 %add, i8* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1 - store i8 %mul, i8* %arrayidx3, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp slt i64 %indvars.iv.next, 1024 - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body +; VF_4-LABEL: Checking a loop in "i16_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "i16_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-LABEL: Checking a loop in "i16_factor_2" +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, 
align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1 + %tmp2 = load i16, i16* %tmp0, align 2 + %tmp3 = load i16, i16* %tmp1, align 2 + store i16 0, i16* %tmp0, align 2 + store i16 0, i16* %tmp1, align 2 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +%i32.2 = type {i32, i32} +define void @i32_factor_2(%i32.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_2-LABEL: Checking a loop in "i32_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-LABEL: Checking a loop in "i32_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_8-LABEL: Checking a loop in "i32_factor_2" +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_16-LABEL: Checking a loop in "i32_factor_2" +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1 + %tmp2 = load i32, i32* %tmp0, align 4 + %tmp3 = load i32, i32* %tmp1, align 4 + store i32 0, i32* %tmp0, align 4 + store i32 0, i32* %tmp1, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +%half.2 = type {half, half} +define void @half_factor_2(%half.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_4-LABEL: Checking a loop in "half_factor_2" +; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 
2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "half_factor_2" +; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1 + %tmp2 = load half, half* %tmp0, align 2 + %tmp3 = load half, half* %tmp1, align 2 + store half 0., half* %tmp0, align 2 + store half 0., half* %tmp1, align 2 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ret void } diff --git a/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll b/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll new file mode 100644 index 000000000000..d2e594520332 --- /dev/null +++ b/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll @@ -0,0 +1,38 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: -force-vector-width=2 -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s + +; Check costs for branches inside a vectorized loop around predicated +; blocks. Each such branch will be guarded with an extractelement from the +; vector compare plus a test under mask instruction. This cost is modelled on +; the extractelement of i1. 
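To make that costing concrete, a minimal sketch (illustrative only, not taken from the test output) of one guarded predicated block at VF 2: each lane's branch extracts its i1 from the vector compare, which on z13 maps to an extract plus a test under mask.

define void @predicated_block_sketch(<2 x i1> %cmp, i32* %dst) {
entry:
  ; guard for lane 0: extract the lane's i1 and branch around the predicated store
  %c0 = extractelement <2 x i1> %cmp, i32 0
  br i1 %c0, label %pred.store.if, label %pred.store.continue

pred.store.if:
  store i32 0, i32* %dst, align 4
  br label %pred.store.continue

pred.store.continue:
  ; lane 1 would be guarded the same way
  ret void
}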
+ +define void @fun(i32* %arr, i64 %trip.count) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv + %l = load i32, i32* %arrayidx, align 4 + %cmp55 = icmp sgt i32 %l, 0 + br i1 %cmp55, label %if.then, label %for.inc + +if.then: + %sub = sub nsw i32 0, %l + store i32 %sub, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + ret void + +; CHECK: LV: Found an estimated cost of 5 for VF 2 For instruction: br i1 %cmp55, label %if.then, label %for.inc +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br label %for.inc +; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond, label %for.end.loopexit, label %for.body +} diff --git a/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg b/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg new file mode 100644 index 000000000000..2f3cf7d3f043 --- /dev/null +++ b/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'SystemZ' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll b/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll new file mode 100644 index 000000000000..e7096c29b994 --- /dev/null +++ b/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll @@ -0,0 +1,33 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: -force-vector-width=4 -debug-only=loop-vectorize \ +; RUN: -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \ +; RUN: FileCheck %s +; +; Check that a scalarized load/store does not get a cost for inserts/ +; extracts, since z13 supports element load/store. + +define void @fun(i32* %data, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + %tmp2 = add i32 %tmp1, 1 + store i32 %tmp2, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void + +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 + +; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 +} + diff --git a/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll new file mode 100644 index 000000000000..5c15ee4f2d9f --- /dev/null +++ b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll @@ -0,0 +1,70 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: -force-vector-width=4 -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s +; +; Check that the loop vectorizer performs memory interleaving with accurate +; cost estimates. + + +; Simple case where just the load is interleaved, because the store group +; would have gaps. 
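Roughly, such a one-member load group of factor 2 lowers as sketched below (assumed VF 4; not part of the test): the gap lanes are loaded and then dropped by a shuffle, while a wide store cannot be formed because it would clobber the gap elements.

define <4 x i32> @gap_load_group_sketch(<8 x i32>* %p) {
  ; the wide load reads eight i32s covering four stride-2 elements plus gaps
  %wide.vec = load <8 x i32>, <8 x i32>* %p, align 4
  ; keep the even lanes; the odd (gap) lanes are discarded
  %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i32> %strided.vec
}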
+define void @fun0(i32* %data, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + %tmp2 = add i32 %tmp1, 1 + store i32 %tmp2, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void + +; CHECK: LV: Creating an interleave group with: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; (vl; vl; vperm) +} + +; Interleaving of both loads and stores. +define void @fun1(i32* %data, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + %i_1 = add i64 %i, 1 + %tmp2 = getelementptr inbounds i32, i32* %data, i64 %i_1 + %tmp3 = load i32, i32* %tmp2, align 4 + store i32 %tmp1, i32* %tmp2, align 4 + store i32 %tmp3, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void + +; CHECK: LV: Creating an interleave group with: store i32 %tmp3, i32* %tmp0, align 4 +; CHECK: LV: Inserted: store i32 %tmp1, i32* %tmp2, align 4 +; CHECK: into the interleave group with store i32 %tmp3, i32* %tmp0, align 4 +; CHECK: LV: Creating an interleave group with: %tmp3 = load i32, i32* %tmp2, align 4 +; CHECK: LV: Inserted: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: into the interleave group with %tmp3 = load i32, i32* %tmp2, align 4 + +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp2, align 4 +; (vl; vl; vperm, vpkg) + +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp1, i32* %tmp2, align 4 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp3, i32* %tmp0, align 4 +; (vmrlf; vmrhf; vst; vst) +} + diff --git a/test/Transforms/LoopVectorize/X86/avx512.ll b/test/Transforms/LoopVectorize/X86/avx512.ll index fb01454c253b..1eb1cd3f5d7a 100644 --- a/test/Transforms/LoopVectorize/X86/avx512.ll +++ b/test/Transforms/LoopVectorize/X86/avx512.ll @@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.9.0" ; loop. ; CHECK-LABEL: f: -; CHECK: vmovups %zmm{{.}}, +; CHECK: vmovdqu32 %zmm{{.}}, ; CHECK-NOT: %ymm define void @f(i32* %a, i32 %n) { diff --git a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index 32bfcd2275ac..82f2e064a581 100644 --- a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -13,22 +13,33 @@ target triple = "x86_64-unknown-linux-gnu" ; scatter operation. %tmp3 (and the induction variable) should not be marked ; uniform-after-vectorization. 
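The distinction can be sketched as follows (illustrative IR, not from the test): a uniform address is a single scalar GEP that serves every lane of a wide load, whereas a scatter consumes one pointer per lane, so its GEP must stay vectorized.

declare void @llvm.masked.scatter.v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)

define void @uniform_vs_lane_gep_sketch(float* %base, i64 %i, <4 x i64> %lanes) {
  ; uniform-after-vectorization: one scalar address feeds a wide load
  %p = getelementptr inbounds float, float* %base, i64 %i
  %wide.p = bitcast float* %p to <4 x float>*
  %vals = load <4 x float>, <4 x float>* %wide.p, align 4
  ; not uniform: the scatter needs a distinct pointer per lane
  %ptrs = getelementptr inbounds float, float* %base, <4 x i64> %lanes
  call void @llvm.masked.scatter.v4f32(<4 x float> %vals, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}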
; -; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i -; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i -; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] -; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5 -; CHECK: vector.body: -; CHECK: %vec.ind = phi <16 x i64> -; CHECK: %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0 -; CHECK: %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]] -; CHECK: %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>* -; CHECK: load <80 x float>, <80 x float>* %[[T2]], align 4 -; CHECK: %[[T3:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %[[T0]] -; CHECK: %[[T4:.+]] = bitcast float* %[[T3]] to <80 x float>* -; CHECK: load <80 x float>, <80 x float>* %[[T4]], align 4 -; CHECK: %VectorGep = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> %vec.ind -; CHECK: call void @llvm.masked.scatter.v16f32({{.*}}, <16 x float*> %VectorGep, {{.*}}) -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i +; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i +; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5 +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>* +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]] +; CHECK-NEXT: [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x <80 x float>*> [[BC]], i32 0 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75> +; CHECK-NEXT: [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], 
[[TMP2]] +; CHECK-NEXT: call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body %data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] } diff --git a/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll new file mode 100644 index 000000000000..76b6cae5c3b4 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll @@ -0,0 +1,41 @@ +; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; This test checks that the "gather" operation is chosen, since its cost is better +; than that of the interleaving pattern. +; +;unsigned long A[SIZE]; +;unsigned long B[SIZE]; +; +;void foo() { +; for (int i=0; i<N; i+=8) { +; B[i] = A[i] + 5; +; } +;} + +@A = global [10240 x i64] zeroinitializer, align 16 +@B = global [10240 x i64] zeroinitializer, align 16 + + +; CHECK-LABEL: strided_load_i64 +; CHECK: masked.gather +define void @strided_load_i64() { + br label %1 + +; <label>:1: ; preds = %0, %1 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv + %3 = load i64, i64* %2, align 16 + %4 = add i64 %3, 5 + %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %4, i64* %5, align 16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8 + %6 = icmp slt i64 %indvars.iv.next, 1024 + br i1 %6, label %1, label %7 + +; <label>:7: ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/test/Transforms/LoopVectorize/X86/int128_no_gather.ll index fbea275cb40f..4d7c0b6f64b8 100644 --- a/test/Transforms/LoopVectorize/X86/int128_no_gather.ll +++ b/test/Transforms/LoopVectorize/X86/int128_no_gather.ll @@ -71,6 +71,6 @@ declare i32 @printf(i8*, ...) 
#1 ; Function Attrs: nounwind declare i32 @puts(i8* nocapture readonly) #2 -attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pcommit,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pcommit,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/test/Transforms/LoopVectorize/X86/interleaving.ll b/test/Transforms/LoopVectorize/X86/interleaving.ll index de5db5324381..9294c92b5759 100644 --- a/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL +; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=NORMAL ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=ATOM ; NORMAL-LABEL: foo diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll index 74c0c16086fe..e1793bcc3218 100644 --- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ 
b/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -1,13 +1,14 @@ ; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1 ; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2 -; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3 +; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-threshold=150 -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3 +; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DEFAULT ; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os ; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz ; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC ; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC ; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2 ; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2 -; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS +; RUN: opt < %s -mcpu=corei7 -O3 -unroll-threshold=150 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS ; This file tests the llvm.loop.vectorize.enable metadata forcing ; vectorization even when optimization levels are too low, or when @@ -25,6 +26,9 @@ target triple = "x86_64-unknown-linux-gnu" ; O3-LABEL: @enabled( ; O3: store <4 x i32> ; O3: ret i32 +; O3DEFAULT-LABEL: @enabled( +; O3DEFAULT: store <4 x i32> +; O3DEFAULT: ret i32 ; Pragma always wins! 
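For reference, a minimal sketch of the metadata shape these RUN lines exercise (function name and trip count are made up): the pragma is recorded as !llvm.loop metadata on the loop latch, which is what forces vectorization regardless of the optimization level.

define void @pragma_sketch(i32* %a) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %gep = getelementptr inbounds i32, i32* %a, i64 %i
  store i32 0, i32* %gep, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp eq i64 %i.next, 64
  ; the vectorize pragma lives on the latch branch
  br i1 %cond, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret void
}

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 1}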
; O3DIS-LABEL: @enabled( ; O3DIS: store <4 x i32> @@ -77,6 +81,9 @@ for.end: ; preds = %for.body ; O3-LABEL: @nopragma( ; O3: store <4 x i32> ; O3: ret i32 +; O3DEFAULT-LABEL: @nopragma( +; O3DEFAULT: store <4 x i32> +; O3DEFAULT: ret i32 ; O3DIS-LABEL: @nopragma( ; O3DIS-NOT: store <4 x i32> ; O3DIS: ret i32 @@ -128,6 +135,9 @@ for.end: ; preds = %for.body ; O3-LABEL: @disabled( ; O3-NOT: store <4 x i32> ; O3: ret i32 +; O3DEFAULT-LABEL: @disabled( +; O3DEFAULT: store <4 x i32> +; O3DEFAULT: ret i32 ; O3DIS-LABEL: @disabled( ; O3DIS-NOT: store <4 x i32> ; O3DIS: ret i32 diff --git a/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/test/Transforms/LoopVectorize/X86/scatter_crash.ll index ec67e632efbd..bda4b2454ee2 100755 --- a/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -16,97 +16,23 @@ target triple = "x86_64-apple-macosx10.11.0" define void @_Z3fn1v() #0 { ; CHECK-LABEL: @_Z3fn1v( ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX:%.*]].next, %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ -; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ -; CHECK-NEXT: [[SHL:%.*]] = shl i64 %index, 1 -; CHECK-NEXT: %offset.idx = add i64 [[SHL]], 8 -; CHECK-NEXT: [[IND00:%.*]] = add i64 %offset.idx, 0 -; CHECK-NEXT: [[IND02:%.*]] = add i64 %offset.idx, 2 -; CHECK-NEXT: [[IND04:%.*]] = add i64 %offset.idx, 4 -; CHECK-NEXT: [[IND06:%.*]] = add i64 %offset.idx, 6 -; CHECK-NEXT: [[IND08:%.*]] = add i64 %offset.idx, 8 -; CHECK-NEXT: [[IND10:%.*]] = add i64 %offset.idx, 10 -; CHECK-NEXT: [[IND12:%.*]] = add i64 %offset.idx, 12 -; CHECK-NEXT: [[IND14:%.*]] = add i64 %offset.idx, 14 -; CHECK-NEXT: [[IND16:%.*]] = add i64 %offset.idx, 16 -; CHECK-NEXT: [[IND18:%.*]] = add i64 %offset.idx, 18 -; CHECK-NEXT: [[IND20:%.*]] = add i64 %offset.idx, 20 -; CHECK-NEXT: [[IND22:%.*]] = add i64 %offset.idx, 22 -; CHECK-NEXT: [[IND24:%.*]] = add i64 %offset.idx, 24 -; CHECK-NEXT: [[IND26:%.*]] = add i64 %offset.idx, 26 -; CHECK-NEXT: [[IND28:%.*]] = add i64 %offset.idx, 28 -; CHECK-NEXT: [[IND30:%.*]] = add i64 %offset.idx, 30 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND00]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND02]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND04]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND06]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND08]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 
x i32]]* @d, i64 0, i64 [[IND10]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND12]] -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND14]] -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND16]] -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND18]] -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND20]] -; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND22]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND24]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]] -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]] -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15 -; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <16 x i64> [[TMP59]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP12]], i64 [[TMP61]], i64 0 -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x i64> [[TMP59]], i32 1 -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP15]], i64 [[TMP65]], i64 0 -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <16 x i64> [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP18]], i64 [[TMP69]], i64 0 -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <16 x i64> 
[[TMP59]], i32 3 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP21]], i64 [[TMP73]], i64 0 -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <16 x i64> [[TMP59]], i32 4 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP24]], i64 [[TMP77]], i64 0 -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <16 x i64> [[TMP59]], i32 5 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP27]], i64 [[TMP81]], i64 0 -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <16 x i64> [[TMP59]], i32 6 -; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP30]], i64 [[TMP85]], i64 0 -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <16 x i64> [[TMP59]], i32 7 -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP33]], i64 [[TMP89]], i64 0 -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <16 x i64> [[TMP59]], i32 8 -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP36]], i64 [[TMP93]], i64 0 -; CHECK-NEXT: [[TMP97:%.*]] = extractelement <16 x i64> [[TMP59]], i32 9 -; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP39]], i64 [[TMP97]], i64 0 -; CHECK-NEXT: [[TMP101:%.*]] = extractelement <16 x i64> [[TMP59]], i32 10 -; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP42]], i64 [[TMP101]], i64 0 -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <16 x i64> [[TMP59]], i32 11 -; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP45]], i64 [[TMP105]], i64 0 -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <16 x i64> [[TMP59]], i32 12 -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP48]], i64 [[TMP109]], i64 0 -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i64> [[TMP59]], i32 13 -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP51]], i64 [[TMP113]], i64 0 -; CHECK-NEXT: [[TMP117:%.*]] = extractelement <16 x i64> [[TMP59]], i32 14 -; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP54]], i64 [[TMP117]], i64 0 -; CHECK-NEXT: [[TMP121:%.*]] = extractelement <16 x i64> [[TMP59]], i32 15 -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP57]], i64 [[TMP121]], i64 0 -; CHECK-NEXT: [[VECTORGEP:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP58]], <16 x i64> [[TMP59]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[VECTORGEP]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) -; CHECK: [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> -; CHECK: [[STEP_ADD4:%.*]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0 
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; entry: %0 = load i32, i32* @c, align 4 %cmp34 = icmp sgt i32 %0, 8 diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll index f28e6be23529..b2933c4b56f2 100644 --- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll +++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll @@ -1,4 +1,6 @@ ; RUN: opt < %s -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -o /dev/null -pass-remarks-output=%t.yaml +; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s ; C/C++ code for tests ; void test(int *A, int Length) { @@ -42,6 +44,61 @@ ; CHECK-NOT: x i32> ; CHECK: ret +; YAML: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: CantComputeNumberOfIterations +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 4, Column: 5 } +; YAML-NEXT: Function: _Z4testPii +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: could not determine number of loop iterations +; YAML-NEXT: ... +; YAML-NEXT: --- !Missed +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: MissedDetails +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 4, Column: 5 } +; YAML-NEXT: Function: _Z4testPii +; YAML-NEXT: Args: +; YAML-NEXT: - String: loop not vectorized +; YAML-NEXT: ... +; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: AllDisabled +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 13, Column: 5 } +; YAML-NEXT: Function: _Z13test_disabledPii +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1' +; YAML-NEXT: ... 
+; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: '' +; YAML-NEXT: Name: CantIdentifyArrayBounds +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 } +; YAML-NEXT: Function: _Z17test_array_boundsPiS_i +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: cannot identify array bounds +; YAML-NEXT: ... +; YAML-NEXT: --- !Missed +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: MissedDetails +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 } +; YAML-NEXT: Function: _Z17test_array_boundsPiS_i +; YAML-NEXT: Args: +; YAML-NEXT: - String: loop not vectorized +; YAML-NEXT: - String: ' (Force=' +; YAML-NEXT: - Force: 'true' +; YAML-NEXT: - String: ')' +; YAML-NEXT: ... +; YAML-NEXT: --- !Failure +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: FailedRequestedVectorization +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 } +; YAML-NEXT: Function: _Z17test_array_boundsPiS_i +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: failed explicitly specified loop vectorization +; YAML-NEXT: ... + target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind optsize ssp uwtable diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll index fc9f97328fb7..91466e65078f 100644 --- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll +++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -pass-remarks-analysis='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -pass-remarks-missed='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s ; Verify analysis remarks are generated when interleaving is not beneficial. ; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that vectorization is not beneficial diff --git a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll index 88b2aa36b08c..125829090c3f 100644 --- a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -438,3 +438,53 @@ for.end: %tmp5 = phi i32 [ %tmp2, %for.body ] ret i32 %tmp5 } + +; INTER-LABEL: bitcast_pointer_operand +; +; Check that a pointer operand having a user other than a memory access is +; recognized as uniform after vectorization. In this test case, %tmp1 is a +; bitcast that is used by a load and a getelementptr instruction (%tmp2). Once +; %tmp2 is marked uniform, %tmp1 should be marked uniform as well. 
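; (Editorial aside.) A rough C equivalent of the loop below, a hypothetical
; reconstruction from the IR rather than part of the original test, makes the
; uniformity argument concrete: both byte loads go through the same converted
; pointer, so the bitcast feeds a memory access directly and also feeds the
; %tmp2 address computation. Note the IR loop is bottom-tested, so this C
; form matches it only for n >= 1.
;
;   void bitcast_pointer_operand(long long *A, char *B, long long n) {
;     for (long long i = 0; i < n; ++i) {
;       char *p = (char *)&A[i];  /* %tmp1, the bitcast */
;       B[i] = p[3] ^ p[0];       /* &p[3] is %tmp2 */
;     }
;   }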
+; +; INTER: LV: Found uniform instruction: %cond = icmp slt i64 %i.next, %n +; INTER-NEXT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3 +; INTER-NEXT: LV: Found uniform instruction: %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i +; INTER-NEXT: LV: Found uniform instruction: %tmp1 = bitcast i64* %tmp0 to i8* +; INTER-NEXT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i +; INTER-NEXT: LV: Found uniform instruction: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] +; INTER-NEXT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 1 +; INTER: vector.body: +; INTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; INTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* %A, i64 [[INDEX]] +; INTER-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <32 x i8>* +; INTER-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, <32 x i8>* [[TMP5]], align 1 +; INTER-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24> +; INTER-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27> +; INTER-NEXT: [[TMP6:%.*]] = xor <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]] +; INTER-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* %B, i64 [[INDEX]] +; INTER-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>* +; INTER-NEXT: store <4 x i8> [[TMP6]], <4 x i8>* [[TMP8]], align 1 +; INTER-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; INTER: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @bitcast_pointer_operand(i64* %A, i8* %B, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i + %tmp1 = bitcast i64* %tmp0 to i8* + %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3 + %tmp3 = load i8, i8* %tmp2, align 1 + %tmp4 = load i8, i8* %tmp1, align 1 + %tmp5 = xor i8 %tmp3, %tmp4 + %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i + store i8 %tmp5, i8* %tmp6 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/discriminator.ll b/test/Transforms/LoopVectorize/discriminator.ll new file mode 100644 index 000000000000..b7d34582dbd8 --- /dev/null +++ b/test/Transforms/LoopVectorize/discriminator.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck --check-prefix=LOOPVEC_4_1 %s +; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=3 < %s | FileCheck --check-prefix=LOOPVEC_2_3 %s +; RUN: opt -S -loop-unroll -unroll-count=5 < %s | FileCheck --check-prefix=LOOPUNROLL_5 %s +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -loop-unroll -unroll-count=2 < %s | FileCheck --check-prefix=LOOPVEC_UNROLL %s + +; Test if vectorization/unroll factor is recorded in discriminator. 
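; (Editorial note on the factors the RUN lines above imply.) With
; -force-vector-width=4 and -force-vector-interleave=1, each instruction of
; the loop body is duplicated 4 * 1 = 4 times per source iteration; with
; width 2 and interleave 3 it is 2 * 3 = 6 times; and -unroll-count=5
; duplicates it 5 times. In the combined run, the vector body is duplicated
; by the product of all three factors and the remainder loop by the unroll
; count alone. The discriminator values checked below encode these
; duplication factors along with additional encoding bits, which are not
; derived here.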
+; +; Original source code: +; 1 int *a; +; 2 int *b; +; 3 +; 4 void foo() { +; 5 for (int i = 0; i < 4096; i++) +; 6 a[i] += b[i]; +; 7 } + +@a = local_unnamed_addr global i32* null, align 8 +@b = local_unnamed_addr global i32* null, align 8 + +define void @_Z3foov() local_unnamed_addr #0 !dbg !6 { + %1 = load i32*, i32** @b, align 8, !dbg !8, !tbaa !9 + %2 = load i32*, i32** @a, align 8, !dbg !13, !tbaa !9 + br label %3, !dbg !14 + +; <label>:3: ; preds = %3, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %3 ] + %4 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv, !dbg !8 + %5 = load i32, i32* %4, align 4, !dbg !8, !tbaa !15 + %6 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv, !dbg !13 + %7 = load i32, i32* %6, align 4, !dbg !17, !tbaa !15 + %8 = add nsw i32 %7, %5, !dbg !17 + store i32 %8, i32* %6, align 4, !dbg !17, !tbaa !15 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !18 + %exitcond = icmp eq i64 %indvars.iv.next, 4096, !dbg !19 + br i1 %exitcond, label %9, label %3, !dbg !14, !llvm.loop !20 + +; <label>:9: ; preds = %3 + ret void, !dbg !21 +} + +;LOOPVEC_4_1: discriminator: 17 +;LOOPVEC_2_3: discriminator: 25 +;LOOPUNROLL_5: discriminator: 21 +; When unrolling after loop vectorize, both vec_body and remainder loop +; are unrolled. +;LOOPVEC_UNROLL: discriminator: 385 +;LOOPVEC_UNROLL: discriminator: 9 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, debugInfoForProfiling: true) +!1 = !DIFile(filename: "a.cc", directory: "/") +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, unit: !0) +!8 = !DILocation(line: 6, column: 13, scope: !6) +!9 = !{!10, !10, i64 0} +!10 = !{!"any pointer", !11, i64 0} +!11 = !{!"omnipotent char", !12, i64 0} +!12 = !{!"Simple C++ TBAA"} +!13 = !DILocation(line: 6, column: 5, scope: !6) +!14 = !DILocation(line: 5, column: 3, scope: !6) +!15 = !{!16, !16, i64 0} +!16 = !{!"int", !11, i64 0} +!17 = !DILocation(line: 6, column: 10, scope: !6) +!18 = !DILocation(line: 5, column: 30, scope: !6) +!19 = !DILocation(line: 5, column: 21, scope: !6) +!20 = distinct !{!20, !14} +!21 = !DILocation(line: 7, column: 1, scope: !6) diff --git a/test/Transforms/LoopVectorize/first-order-recurrence.ll b/test/Transforms/LoopVectorize/first-order-recurrence.ll new file mode 100644 index 000000000000..3d1c78038e32 --- /dev/null +++ b/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -0,0 +1,398 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-VF + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; void recurrence_1(int *a, int *b, int n) { +; for(int i = 0; i < n; i++) +; b[i] = a[i] + a[i - 1] +; } +; +; CHECK-LABEL: @recurrence_1( +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3 +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: 
[[L1]] = load <4 x i32> +; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ] +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] +; +; UNROLL-LABEL: @recurrence_1( +; UNROLL: vector.body: +; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] +; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> +; UNROLL: [[L2]] = load <4 x i32> +; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: middle.block: +; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 +; +define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) { +entry: + br label %for.preheader + +for.preheader: + %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0 + %pre_load = load i32, i32* %arrayidx.phi.trans.insert + br label %scalar.body + +scalar.body: + %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx32 + %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %add35 = add i32 %1, %0 + store i32 %add35, i32* %arrayidx34 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.exit, label %scalar.body + +for.exit: + ret void +} + +; int recurrence_2(int *a, int n) { +; int minmax; +; for (int i = 0; i < n; ++i) +; minmax = min(minmax, max(a[i] - a[i-1], 0)); +; return minmax; +; } +; +; CHECK-LABEL: @recurrence_2( +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3 +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: [[L1]] = load <4 x i32> +; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ] +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] +; +; UNROLL-LABEL: @recurrence_2( +; UNROLL: vector.body: +; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] +; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> +; UNROLL: [[L2]] = load <4 x i32> +; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: middle.block: +; UNROLL: 
%vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 +; +define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) { +entry: + %cmp27 = icmp sgt i32 %n, 0 + br i1 %cmp27, label %for.preheader, label %for.cond.cleanup + +for.preheader: + %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1 + %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4 + br label %scalar.body + +for.cond.cleanup.loopexit: + %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ] + br label %for.cond.cleanup + +for.cond.cleanup: + %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %minmax.0.lcssa + +scalar.body: + %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %sub3 = sub nsw i32 %1, %0 + %cmp4 = icmp sgt i32 %sub3, 0 + %cond = select i1 %cmp4, i32 %sub3, i32 0 + %cmp5 = icmp slt i32 %minmax.028, %cond + %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body +} + +; void recurrence_3(short *a, double *b, int n, float f, short p) { +; b[0] = (double)a[0] - f * (double)p; +; for (int i = 1; i < n; i++) +; b[i] = (double)a[i] - f * (double)a[i - 1]; +; } +; +; CHECK-LABEL: @recurrence_3( +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3 +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: [[L1]] = load <4 x i16> +; CHECK: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ] +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] +; +; UNROLL-LABEL: @recurrence_3( +; UNROLL: vector.body: +; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] +; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16> +; UNROLL: [[L2]] = load <4 x i16> +; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: middle.block: +; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3 +; +define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) { +entry: + %0 = load i16, i16* %a, align 2 + %conv = sitofp i16 %0 to double + %conv1 = fpext float %f to double + %conv2 = sitofp i16 %p to double + %mul = fmul fast double %conv2, %conv1 + %sub = fsub fast double %conv, %mul + store double %sub, double* %b, align 8 + %cmp25 = icmp sgt i32 %n, 1 + br i1 %cmp25, label %for.preheader, label %for.end + +for.preheader: + br label 
%scalar.body + +scalar.body: + %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ] + %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ] + %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv + %2 = load i16, i16* %arrayidx5, align 2 + %conv6 = sitofp i16 %2 to double + %conv11 = sitofp i16 %1 to double + %mul12 = fmul fast double %conv11, %conv1 + %sub13 = fsub fast double %conv6, %mul12 + %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv + store double %sub13, double* %arrayidx15, align 8 + %advars.iv.next = add nuw nsw i64 %advars.iv, 1 + %lftr.wideiv = trunc i64 %advars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %scalar.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; void PR26734(short *a, int *b, int *c, int d, short *e) { +; for (; d != 21; d++) { +; *b &= *c; +; *e = *a - 6; +; *c = *e; +; } +; } +; +; CHECK-LABEL: @PR26734( +; CHECK-NOT: vector.ph: +; CHECK: } +; +define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) { +entry: + %cmp4 = icmp eq i32 %d, 21 + br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph + +entry.for.end_crit_edge: + %.pre = load i32, i32* %b, align 4 + br label %for.end + +for.body.lr.ph: + %0 = load i16, i16* %a, align 2 + %sub = add i16 %0, -6 + %conv2 = sext i16 %sub to i32 + %c.promoted = load i32, i32* %c, align 4 + %b.promoted = load i32, i32* %b, align 4 + br label %for.body + +for.body: + %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ] + %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ] + %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ] + %and = and i32 %and6, %conv25 + %inc = add nsw i32 %inc7, 1 + %cmp = icmp eq i32 %inc, 21 + br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %and.lcssa = phi i32 [ %and, %for.body ] + store i32 %conv2, i32* %c, align 4 + store i32 %and.lcssa, i32* %b, align 4 + store i16 %sub, i16* %e, align 2 + br label %for.end + +for.end: + ret void +} + +; int PR27246() { +; unsigned int e, n; +; for (int i = 1; i < 49; ++i) { +; for (int k = i; k > 1; --k) +; e = k; +; n = e; +; } +; return n; +; } +; +; CHECK-LABEL: @PR27246( +; CHECK-NOT: vector.ph: +; CHECK: } +; +define i32 @PR27246() { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ] + %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ] + br label %for.cond1 + +for.cond.cleanup: + %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ] + ret i32 %e.1.lcssa.lcssa + +for.cond1: + %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ] + %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ] + %cmp2 = icmp sgt i32 %k.0, 1 + %dec = add nsw i32 %k.0, -1 + br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3 + +for.cond.cleanup3: + %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ] + %inc = add nuw nsw i32 %i.016, 1 + %exitcond = icmp eq i32 %inc, 49 + br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader +} + +; UNROLL-NO-IC-LABEL: @PR30183( +; UNROLL-NO-IC: vector.ph: +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 +; UNROLL-NO-IC-NEXT: br label %vector.body +; UNROLL-NO-IC: vector.body: +; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; 
UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], %vector.ph ], [ [[TMP42:%.*]], %vector.body ] +; UNROLL-NO-IC: [[TMP27:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = load i32, i32* {{.*}} +; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP27]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP28]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP29]], i32 2 +; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP30]], i32 3 +; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP32]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP33]], i32 2 +; UNROLL-NO-IC-NEXT: [[TMP42]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP34]], i32 3 +; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP38]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; UNROLL-NO-IC: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @PR30183(i32 %pre_load, i32* %a, i32* %b, i64 %n) { +entry: + br label %scalar.body + +scalar.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ] + %tmp0 = phi i32 [ %pre_load, %entry ], [ %tmp2, %scalar.body ] + %i.next = add nuw nsw i64 %i, 2 + %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i.next + %tmp2 = load i32, i32* %tmp1 + %cond = icmp eq i64 %i.next,%n + br i1 %cond, label %for.end, label %scalar.body + +for.end: + ret void +} + +; UNROLL-NO-IC-LABEL: @constant_folded_previous_value( +; UNROLL-NO-IC: vector.body: +; UNROLL-NO-IC: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 undef, i64 undef, i64 undef, i64 0>, %vector.ph ], [ <i64 1, i64 1, i64 1, i64 1>, %vector.body ] +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL-NO-IC: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @constant_folded_previous_value() { +entry: + br label %scalar.body + +scalar.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ] + %tmp2 = phi i64 [ 0, %entry ], [ %tmp3, %scalar.body ] + %tmp3 = add i64 0, 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, undef + br i1 %cond, label %for.end, label %scalar.body + +for.end: + ret void +} + +; We vectorize this first order recurrence, by generating two +; extracts for the phi `val.phi` - one at the last index and +; another at the second last index. We need these 2 extracts because +; the first order recurrence phi is used outside the loop, so we require the phi +; itself and not its update (addx). 
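; (Editorial aside.) A rough C equivalent of the function below, a
; hypothetical reconstruction from the IR rather than part of the original
; test, shows why the loop-live-out value is the phi and not its update:
;
;   int extract_second_last_iteration(int *cval /* unused */, int x) {
;     int val = 0, prev = 0;
;     for (int i = 0; i != 96; ++i) {
;       prev = val;   /* %val.phi: the previous iteration's %addx */
;       val = i + x;  /* %addx */
;     }
;     return prev;    /* 94 + x, the value of the second-to-last update */
;   }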
+; UNROLL-NO-IC-LABEL: extract_second_last_iteration +; UNROLL-NO-IC: vector.body +; UNROLL-NO-IC: %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> +; UNROLL-NO-IC: %[[L1:.+]] = add <4 x i32> %vec.ind, %broadcast.splat +; UNROLL-NO-IC: %[[L2:.+]] = add <4 x i32> %step.add, %broadcast.splat +; UNROLL-NO-IC: %index.next = add i32 %index, 8 +; UNROLL-NO-IC: icmp eq i32 %index.next, 96 +; UNROLL-NO-IC: middle.block +; UNROLL-NO-IC: icmp eq i32 96, 96 +; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x i32> %[[L2]], i32 3 +; UNROLL-NO-IC: %vector.recur.extract.for.phi = extractelement <4 x i32> %[[L2]], i32 2 +; UNROLL-NO-IC: for.end +; UNROLL-NO-IC: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ] +; Check the case when unrolled but not vectorized. +; UNROLL-NO-VF-LABEL: extract_second_last_iteration +; UNROLL-NO-VF: vector.body: +; UNROLL-NO-VF: %induction = add i32 %index, 0 +; UNROLL-NO-VF: %induction1 = add i32 %index, 1 +; UNROLL-NO-VF: %[[L1:.+]] = add i32 %induction, %x +; UNROLL-NO-VF: %[[L2:.+]] = add i32 %induction1, %x +; UNROLL-NO-VF: %index.next = add i32 %index, 2 +; UNROLL-NO-VF: icmp eq i32 %index.next, 96 +; UNROLL-NO-VF: for.end: +; UNROLL-NO-VF: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %[[L1]], %middle.block ] +define i32 @extract_second_last_iteration(i32* %cval, i32 %x) { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ] + %inc = add i32 %inc.phi, 1 + %bc = zext i32 %inc.phi to i64 + %addx = add i32 %inc.phi, %x + %cmp = icmp eq i32 %inc.phi, 95 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 %val.phi +} diff --git a/test/Transforms/LoopVectorize/float-induction.ll b/test/Transforms/LoopVectorize/float-induction.ll index 79bddf471c26..8eec6e262c1a 100644 --- a/test/Transforms/LoopVectorize/float-induction.ll +++ b/test/Transforms/LoopVectorize/float-induction.ll @@ -1,43 +1,7 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s - -; VEC4_INTERL1-LABEL: @fp_iv_loop1( -; VEC4_INTERL1: %[[FP_INC:.*]] = load float, float* @fp_inc -; VEC4_INTERL1: vector.body: -; VEC4_INTERL1: %[[FP_INDEX:.*]] = sitofp i64 {{.*}} to float -; VEC4_INTERL1: %[[VEC_INCR:.*]] = fmul fast float {{.*}}, %[[FP_INDEX]] -; VEC4_INTERL1: %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[VEC_INCR]] -; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement <4 x float> undef, float %[[FP_OFFSET_IDX]], i32 0 -; VEC4_INTERL1-NEXT: %[[BRCT_SPLAT:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement {{.*}} %[[FP_INC]] -; VEC4_INTERL1-NEXT: %[[FP_INC_BCST:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], {{.*}} zeroinitializer -; VEC4_INTERL1: %[[VSTEP:.*]] = fmul fast <4 x float> %[[FP_INC_BCST]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> -; VEC4_INTERL1-NEXT: %[[VEC_INDUCTION:.*]] = fsub fast <4 x float> %[[BRCT_SPLAT]], %[[VSTEP]] -; VEC4_INTERL1: store <4 x float> %[[VEC_INDUCTION]] - 
-; VEC4_INTERL2-LABEL: @fp_iv_loop1( -; VEC4_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc -; VEC4_INTERL2: vector.body: -; VEC4_INTERL2: %[[INDEX:.*]] = sitofp i64 {{.*}} to float -; VEC4_INTERL2: %[[VEC_INCR:.*]] = fmul fast float %{{.*}}, %[[INDEX]] -; VEC4_INTERL2: fsub fast float %init, %[[VEC_INCR]] -; VEC4_INTERL2: %[[VSTEP1:.*]] = fmul fast <4 x float> %{{.*}}, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> -; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION1:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP1]] -; VEC4_INTERL2: %[[VSTEP2:.*]] = fmul fast <4 x float> %{{.*}}, <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00> -; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION2:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP2]] -; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION1]] -; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION2]] - -; VEC1_INTERL2-LABEL: @fp_iv_loop1( -; VEC1_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc -; VEC1_INTERL2: vector.body: -; VEC1_INTERL2: %[[INDEX:.*]] = sitofp i64 {{.*}} to float -; VEC1_INTERL2: %[[STEP:.*]] = fmul fast float %{{.*}}, %[[INDEX]] -; VEC1_INTERL2: %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[STEP]] -; VEC1_INTERL2: %[[SCALAR_INDUCTION2:.*]] = fsub fast float %[[FP_OFFSET_IDX]], %[[FP_INC]] -; VEC1_INTERL2: store float %[[FP_OFFSET_IDX]] -; VEC1_INTERL2: store float %[[SCALAR_INDUCTION2]] +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -dce -simplifycfg -instcombine -S | FileCheck --check-prefix VEC2_INTERL1_PRED_STORE %s @fp_inc = common global float 0.000000e+00, align 4 @@ -49,6 +13,71 @@ ; } ;} +; VEC4_INTERL1-LABEL: @fp_iv_loop1( +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> +; VEC4_INTERL1-NEXT: [[INDUCTION4:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]] +; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION4]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP9]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT6]] +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body + +; VEC4_INTERL2-LABEL: 
@fp_iv_loop1( +; VEC4_INTERL2: vector.ph: +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT4]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> +; VEC4_INTERL2-NEXT: [[INDUCTION5:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]] +; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00 +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: br label %vector.body +; VEC4_INTERL2: vector.body: +; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION5]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT7]] +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4 +; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>* +; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4 +; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT7]] +; VEC4_INTERL2: br i1 {{.*}}, label %middle.block, label %vector.body + +; VEC1_INTERL2-LABEL: @fp_iv_loop1( +; VEC1_INTERL2: vector.ph: +; VEC1_INTERL2-NEXT: br label %vector.body +; VEC1_INTERL2: vector.body: +; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fmul fast float %fpinc, [[TMP6]] +; VEC1_INTERL2-NEXT: [[FP_OFFSET_IDX:%.*]] = fsub fast float %init, [[TMP7]] +; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = fsub fast float [[FP_OFFSET_IDX]], %fpinc +; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDUCTION2]] +; VEC1_INTERL2-NEXT: store float [[FP_OFFSET_IDX]], float* [[TMP9]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP8]], float* [[TMP10]], align 4 +; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VEC1_INTERL2: br i1 {{.*}}, label %middle.block, label %vector.body + define void @fp_iv_loop1(float %init, float* noalias nocapture %A, i32 %N) #1 { entry: %cmp4 = icmp sgt i32 %N, 0 @@ -85,15 +114,20 @@ for.end: ; preds = %for.end.loopexit, % ;} ; VEC4_INTERL1-LABEL: @fp_iv_loop2( -; VEC4_INTERL1: vector.body -; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ] -; 
VEC4_INTERL1: sitofp i64 %[[index]] to float -; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, 5.000000e-01 -; VEC4_INTERL1: %[[VAR2:.*]] = fadd fast float %[[VAR1]] -; VEC4_INTERL1: insertelement <4 x float> undef, float %[[VAR2]], i32 0 -; VEC4_INTERL1: shufflevector <4 x float> {{.*}}, <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00> -; VEC4_INTERL1: store <4 x float> +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00> +; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION2]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 { entry: @@ -133,14 +167,43 @@ for.end: ; preds = %for.end.loopexit, % ; C[i] = y; ; } ;} + ; VEC4_INTERL1-LABEL: @fp_iv_loop3( -; VEC4_INTERL1: vector.body -; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ] -; VEC4_INTERL1: sitofp i64 %[[index]] to float -; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, -5.000000e-01 -; VEC4_INTERL1: fadd fast float %[[VAR1]] -; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, <float -5.000000e-01, float -1.000000e+00, float -1.500000e+00, float -2.000000e+00> -; VEC4_INTERL1: store <4 x float> +; VEC4_INTERL1: for.body.lr.ph: +; VEC4_INTERL1: [[TMP0:%.*]] = load float, float* @fp_inc, align 4 +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> +; VEC4_INTERL1-NEXT: [[INDUCTION7:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP7]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> undef, <4 x i32> zeroinitializer +; 
VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 +; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT12]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ <float 0x3FB99999A0000000, float 0xBFD99999A0000000, float 0xBFECCCCCC0000000, float 0xBFF6666660000000>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND10]], <4 x float>* [[TMP13]], align 4 +; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT13]] +; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> +; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP14]] +; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* %B, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[TMP16]], <4 x float>* [[TMP18]], align 4 +; VEC4_INTERL1-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* %C, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[TMP15]], <4 x float>* [[TMP20]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00> +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT11]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]] +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 { entry: @@ -186,10 +249,17 @@ for.end: ;} ; VEC4_INTERL1-LABEL: @fp_iv_loop4( -; VEC4_INTERL1: vector.body -; VEC4_INTERL1-NOT: fmul fast <4 x float> -; VEC4_INTERL1: %[[induction:.*]] = fadd fast <4 x float> %{{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00> -; VEC4_INTERL1: store <4 x float> %[[induction]] +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], 
<float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) { entry: @@ -216,3 +286,55 @@ for.end.loopexit: ; preds = %for.body for.end: ; preds = %for.end.loopexit, %entry ret void } + +; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar( +; VEC2_INTERL1_PRED_STORE: vector.body: +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ], [ 0, %min.iters.checked ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* +; VEC2_INTERL1_PRED_STORE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fcmp fast oeq <2 x float> [[WIDE_LOAD]], zeroinitializer +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF]]: +; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP1]], float* [[TMP2]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF6]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 1 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* %A, i64 [[TMP10]] +; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP9]], float* [[TMP11]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE7]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VEC2_INTERL1_PRED_STORE: br i1 {{.*}}, label %middle.block, label %vector.body + +define void @non_primary_iv_float_scalar(float* %A, i64 %N) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.inc ], [ 0, %entry ] + %j = phi float [ %j.next, %for.inc ], [ 0.0, %entry ] + %tmp0 = getelementptr inbounds float, float* %A, i64 %i + %tmp1 = load float, float* %tmp0, align 4 + %tmp2 = fcmp fast oeq float %tmp1, 0.0 + br i1 %tmp2, label %if.pred, label %for.inc + +if.pred: + store float %j, float* %tmp0, align 4 + br label %for.inc + +for.inc: + %i.next = add nuw nsw i64 %i, 1 + %j.next = fadd fast float %j, 1.0 + %cond = icmp slt i64 %i.next, %N + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll index acf7b12540d3..d3a16e2075d1 100644 --- a/test/Transforms/LoopVectorize/if-conversion.ll +++ b/test/Transforms/LoopVectorize/if-conversion.ll @@ -18,9 +18,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @function0( ;CHECK: load <4 x i32> +;CHECK: icmp sle <4 x i32> ;CHECK: mul <4 x i32> ;CHECK: add <4 x i32> -;CHECK: icmp sle <4 x i32> ;CHECK: select <4 
x i1> ;CHECK: ret i32 define i32 @function0(i32* nocapture %a, i32* nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp { @@ -71,8 +71,8 @@ for.end: ;CHECK-LABEL: @reduction_func( ;CHECK: load <4 x i32> -;CHECK: add <4 x i32> ;CHECK: icmp slt <4 x i32> +;CHECK: add <4 x i32> ;CHECK: select <4 x i1> ;CHECK: ret i32 define i32 @reduction_func(i32* nocapture %A, i32 %n) nounwind uwtable readonly ssp { diff --git a/test/Transforms/LoopVectorize/if-pred-stores.ll b/test/Transforms/LoopVectorize/if-pred-stores.ll index c4368148caf9..a1837b352eef 100644 --- a/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -11,6 +11,7 @@ entry: ; VEC-LABEL: test ; VEC: %[[v0:.+]] = add i64 %index, 0 +; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]] ; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100> ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true> ; VEC: %[[o1:.+]] = or <2 x i1> zeroinitializer, %[[v10]] @@ -21,7 +22,6 @@ entry: ; VEC: [[cond]]: ; VEC: %[[v13:.+]] = extractelement <2 x i32> %wide.load, i32 0 ; VEC: %[[v9a:.+]] = add nsw i32 %[[v13]], 20 -; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]] ; VEC: store i32 %[[v9a]], i32* %[[v2]], align 4 ; VEC: br label %[[else:.+]] ; diff --git a/test/Transforms/LoopVectorize/induction-step.ll b/test/Transforms/LoopVectorize/induction-step.ll index f56456e82dfa..33e8ed067160 100644 --- a/test/Transforms/LoopVectorize/induction-step.ll +++ b/test/Transforms/LoopVectorize/induction-step.ll @@ -12,11 +12,30 @@ ;} ; CHECK-LABEL: @induction_with_global( -; CHECK: %[[INT_INC:.*]] = load i32, i32* @int_inc, align 4 -; CHECK: vector.body: -; CHECK: %[[VAR1:.*]] = insertelement <8 x i32> undef, i32 %[[INT_INC]], i32 0 -; CHECK: %[[VAR2:.*]] = shufflevector <8 x i32> %[[VAR1]], <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VAR2]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @int_inc, align 4 +; CHECK: vector.ph: +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]] +; CHECK-NEXT: [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP0]], 8 +; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ] +; CHECK: [[TMP8:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> %vec.ind, <8 x i32>* [[TMP11]], 
align 4 +; CHECK: %index.next = add i64 %index, 8 +; CHECK-NEXT: %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]] +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -66,13 +85,28 @@ for.end: ; preds = %for.end.loopexit, % ;} ; CHECK-LABEL: @induction_with_loop_inv( -; CHECK: for.cond1.preheader: -; CHECK: %[[INDVAR0:.*]] = phi i32 [ 0, -; CHECK: %[[INDVAR1:.*]] = phi i32 [ 0, -; CHECK: vector.body: -; CHECK: %[[VAR1:.*]] = insertelement <8 x i32> undef, i32 %[[INDVAR1]], i32 0 -; CHECK: %[[VAR2:.*]] = shufflevector <8 x i32> %[[VAR1]], <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VAR2]] +; CHECK: vector.ph: +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 %j.012, i32 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]] +; CHECK-NEXT: [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 %j.012, 8 +; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ] +; CHECK: [[TMP6:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> %vec.ind, <8 x i32>* [[TMP9]], align 4 +; CHECK: %index.next = add i64 %index, 8 +; CHECK-NEXT: %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]] +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body define i32 @induction_with_loop_inv(i32 %init, i32* noalias nocapture %A, i32 %N, i32 %M) { entry: @@ -122,3 +156,46 @@ for.end6: ; preds = %for.end6.loopexit, %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ] ret i32 %x.0.lcssa } + + +; CHECK-LABEL: @non_primary_iv_loop_inv_trunc( +; CHECK: vector.ph: +; CHECK: [[TMP3:%.*]] = trunc i64 %step to i32 +; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]] +; CHECK-NEXT: [[INDUCTION7:%.*]] = add <8 x i32> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP3]], 8 +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: %index = phi 
i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ] +; CHECK: [[TMP6:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[VEC_IND10]], <8 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: %index.next = add i64 %index, 8 +; CHECK: [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]] +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body + +define void @non_primary_iv_loop_inv_trunc(i32* %a, i64 %n, i64 %step) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp1 = trunc i64 %j to i32 + store i32 %tmp1, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 1 + %j.next = add nuw nsw i64 %j, %step + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 6213b4a7c2e9..0d7d9fe0c1b8 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -7,11 +7,19 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; Make sure that we can handle multiple integer induction variables. +; ; CHECK-LABEL: @multi_int_induction( -; CHECK: vector.body: -; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %[[VAR:.*]] = trunc i64 %index to i32 -; CHECK: %offset.idx = add i32 190, %[[VAR]] +; CHECK: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vec.ind = phi <2 x i32> [ <i32 190, i32 191>, %vector.ph ], [ %vec.ind.next, %vector.body ] +; CHECK: [[TMP3:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* %A, i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> %vec.ind, <2 x i32>* [[TMP6]], align 4 +; CHECK: %index.next = add i64 %index, 2 +; CHECK-NEXT: %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body define void @multi_int_induction(i32* %A, i32 %N) { for.body.lr.ph: br label %for.body @@ -765,3 +773,79 @@ for.body: exit: ret void } + +; CHECK-LABEL: @non_primary_iv_trunc( +; CHECK: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK: [[TMP3:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* %a, i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: %index.next = add i64 %index, 2 +; CHECK: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4> +; CHECK: br i1 {{.*}}, label %middle.block, 
label %vector.body +define void @non_primary_iv_trunc(i32* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp1 = trunc i64 %j to i32 + store i32 %tmp1, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 1 + %j.next = add nuw nsw i64 %j, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; PR32419. Ensure we transform truncated non-primary induction variables. In +; the test case below we replace %tmp1 with a new induction variable. Because +; the truncated value is non-primary, we must compute an offset from the +; primary induction variable. +; +; CHECK-LABEL: @PR32419( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE4:.*]] ] +; CHECK: [[OFFSET_IDX:%.*]] = add i32 -20, [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[OFFSET_IDX]] to i16 +; CHECK: [[TMP8:%.*]] = add i16 [[TMP1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = urem i16 %b, [[TMP8]] +; CHECK: [[TMP15:%.*]] = add i16 [[TMP1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = urem i16 %b, [[TMP15]] +; CHECK: [[PRED_UREM_CONTINUE4]]: +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define i32 @PR32419(i32 %a, i16 %b) { +entry: + br label %for.body + +for.body: + %i = phi i32 [ -20, %entry ], [ %i.next, %for.inc ] + %tmp0 = phi i32 [ %a, %entry ], [ %tmp6, %for.inc ] + %tmp1 = trunc i32 %i to i16 + %tmp2 = icmp eq i16 %tmp1, 0 + br i1 %tmp2, label %for.inc, label %for.cond + +for.cond: + %tmp3 = urem i16 %b, %tmp1 + br label %for.inc + +for.inc: + %tmp4 = phi i16 [ %tmp3, %for.cond ], [ 0, %for.body ] + %tmp5 = sext i16 %tmp4 to i32 + %tmp6 = or i32 %tmp0, %tmp5 + %i.next = add nsw i32 %i, 1 + %cond = icmp eq i32 %i.next, 0 + br i1 %cond, label %for.end, label %for.body + +for.end: + %tmp7 = phi i32 [ %tmp6, %for.inc ] + ret i32 %tmp7 +} diff --git a/test/Transforms/LoopVectorize/lcssa-crash.ll b/test/Transforms/LoopVectorize/lcssa-crash.ll index e6bd6ed61e22..3d3ef9e05935 100644 --- a/test/Transforms/LoopVectorize/lcssa-crash.ll +++ b/test/Transforms/LoopVectorize/lcssa-crash.ll @@ -37,3 +37,26 @@ L0: L1: ret void } + +; This loop has different uniform instructions before and after LCSSA. 
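For reference, "LCSSA" here is loop-closed SSA form: every value defined inside a loop and used outside of it must be routed through a single-operand PHI node in the loop's exit block. A minimal sketch of that shape in isolation (illustrative only, not taken from this patch):

define i64 @lcssa_sketch(i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %iv.next = add i64 %iv, 1
  %cond = icmp slt i64 %iv.next, %n
  br i1 %cond, label %loop, label %exit

exit:
  ; The one-entry PHI "closes" the loop: every outside use of %iv goes through it.
  %iv.lcssa = phi i64 [ %iv, %loop ]
  ret i64 %iv.lcssa
}

Because the vectorizer's uniformity analysis runs on the loop after it has been put into this closed form, inserting or folding such PHIs can change which instructions look uniform; that is the situation @test3 below exercises.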
+define void @test3() { +entry: + %add41 = add i32 undef, undef + %idxprom4736 = zext i32 %add41 to i64 + br label %while.body + +while.body: + %idxprom4738 = phi i64 [ %idxprom47, %while.body ], [ %idxprom4736, %entry ] + %pos.337 = phi i32 [ %inc46, %while.body ], [ %add41, %entry ] + %inc46 = add i32 %pos.337, 1 + %arrayidx48 = getelementptr inbounds [1024 x i8], [1024 x i8]* undef, i64 0, i64 %idxprom4738 + store i8 0, i8* %arrayidx48, align 1 + %and43 = and i32 %inc46, 3 + %cmp44 = icmp eq i32 %and43, 0 + %idxprom47 = zext i32 %inc46 to i64 + br i1 %cmp44, label %while.end, label %while.body + +while.end: + %add58 = add i32 %inc46, 4 + ret void +} diff --git a/test/Transforms/LoopVectorize/lifetime.ll b/test/Transforms/LoopVectorize/lifetime.ll index 6e525ca1d822..860fe2d983cd 100644 --- a/test/Transforms/LoopVectorize/lifetime.ll +++ b/test/Transforms/LoopVectorize/lifetime.ll @@ -13,23 +13,23 @@ define void @test(i32 *%d) { entry: %arr = alloca [1024 x i32], align 16 %0 = bitcast [1024 x i32]* %arr to i8* - call void @llvm.lifetime.start(i64 4096, i8* %0) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1 br label %for.body for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - call void @llvm.lifetime.end(i64 4096, i8* %0) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1 %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv %1 = load i32, i32* %arrayidx, align 8 store i32 100, i32* %arrayidx, align 8 - call void @llvm.lifetime.start(i64 4096, i8* %0) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp ne i32 %lftr.wideiv, 128 br i1 %exitcond, label %for.body, label %for.end for.end: - call void @llvm.lifetime.end(i64 4096, i8* %0) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1 ret void } @@ -42,24 +42,24 @@ define void @testbitcast(i32 *%d) { entry: %arr = alloca [1024 x i32], align 16 %0 = bitcast [1024 x i32]* %arr to i8* - call void @llvm.lifetime.start(i64 4096, i8* %0) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1 br label %for.body for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %1 = bitcast [1024 x i32]* %arr to i8* - call void @llvm.lifetime.end(i64 4096, i8* %1) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1 %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv %2 = load i32, i32* %arrayidx, align 8 store i32 100, i32* %arrayidx, align 8 - call void @llvm.lifetime.start(i64 4096, i8* %1) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp ne i32 %lftr.wideiv, 128 br i1 %exitcond, label %for.body, label %for.end for.end: - call void @llvm.lifetime.end(i64 4096, i8* %0) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1 ret void } @@ -77,11 +77,11 @@ for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %0 = getelementptr [1024 x i32], [1024 x i32]* %arr, i32 0, i64 %indvars.iv %1 = bitcast [1024 x i32]* %arr to i8* - call void @llvm.lifetime.end(i64 4096, i8* %1) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1 %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv %2 = load i32, i32* %arrayidx, align 8 store i32 100, i32* %arrayidx, align 8 - call void @llvm.lifetime.start(i64 4096, i8* %1) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* 
%1) #1 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp ne i32 %lftr.wideiv, 128 @@ -91,6 +91,6 @@ for.end: ret void } -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 diff --git a/test/Transforms/LoopVectorize/loop-scalars.ll b/test/Transforms/LoopVectorize/loop-scalars.ll new file mode 100644 index 000000000000..4dcd5993c128 --- /dev/null +++ b/test/Transforms/LoopVectorize/loop-scalars.ll @@ -0,0 +1,143 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: vector_gep +; CHECK-NOT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <2 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>* +; CHECK-NEXT: store <2 x i32*> [[TMP1]], <2 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @vector_gep(i32** %a, i32 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %tmp0, i32** %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: scalar_store +; CHECK: LV: Found scalar instruction: %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i +; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i +; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]] +; CHECK-NEXT: store i32* [[TMP5]], i32** [[TMP7]], align 8 +; CHECK-NEXT: store i32* [[TMP6]], i32** [[TMP8]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @scalar_store(i32** %a, i32 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 
%i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %tmp0, i32** %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: expansion +; CHECK: LV: Found scalar instruction: %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i +; CHECK-NEXT: LV: Found scalar instruction: %tmp1 = bitcast i64* %tmp0 to i32* +; CHECK-NEXT: LV: Found scalar instruction: %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0 +; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i +; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32** [[TMP7]] to i64** +; CHECK-NEXT: store i64* [[TMP5]], i64** [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32** [[TMP8]] to i64** +; CHECK-NEXT: store i64* [[TMP6]], i64** [[TMP10]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @expansion(i32** %a, i64 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i + %tmp1 = bitcast i64* %tmp0 to i32* + %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0 + %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i + store i32* %tmp1, i32** %tmp3, align 8 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: no_gep_or_bitcast +; CHECK-NOT: LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8 +; CHECK: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1 +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: store i32 0, i32* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: store i32 0, i32* [[TMP4]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = 
getelementptr inbounds i32*, i32** %a, i64 %i + %tmp1 = load i32*, i32** %tmp0, align 8 + store i32 0, i32* %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll b/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll index a310b10a5c81..5c87dc435c7c 100644 --- a/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll +++ b/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll @@ -13,9 +13,9 @@ ; int v3[Z][Z]; ; } s; ; -; void slow_function (s* const obj) { +; void slow_function (s* const obj, int z) { ; for (int j=0; j<Z; j++) { -; for (int k=0; k<Z; k++) { +; for (int k=0; k<z; k++) { ; int x = obj->v1[k] + obj->v2[j]; ; obj->v3[j][k] += x; ; } @@ -31,7 +31,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] } -define void @Test(%struct.s* nocapture %obj) #0 { +define void @Test(%struct.s* nocapture %obj, i64 %z) #0 { br label %.outer.preheader @@ -59,6 +59,6 @@ define void @Test(%struct.s* nocapture %obj) #0 { %8 = add nsw i32 %5, %7 store i32 %8, i32* %6 %j.next = add nuw nsw i64 %j, 1 - %exitcond.inner = icmp eq i64 %j.next, 32 + %exitcond.inner = icmp eq i64 %j.next, %z br i1 %exitcond.inner, label %.outer, label %.inner } diff --git a/test/Transforms/LoopVectorize/partial-lcssa.ll b/test/Transforms/LoopVectorize/partial-lcssa.ll new file mode 100644 index 000000000000..1306ed971c47 --- /dev/null +++ b/test/Transforms/LoopVectorize/partial-lcssa.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s +; We vectorize the inner loop, so we have to put it in LCSSA form. +; However, there's no reason to touch the outer loop. 
+
+; CHECK-LABEL: @foo
+; CHECK-LABEL: for.end.inner.loopexit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i64 [ %indvars.iv, %for.body.inner ], [ %{{.*}}, %middle.block ]
+; CHECK: store i64 %[[LCSSAPHI]], i64* %O1, align 4
+; CHECK-LABEL: for.end.outer.loopexit
+; CHECK: store i64 %indvars.outer, i64* %O2, align 4
+
+
+define i64 @foo(i32* nocapture %A, i32* nocapture %B, i64 %n, i64 %m, i64* %O1, i64* %O2) {
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
+
+for.body.outer.preheader:                         ; preds = %entry
+  br label %for.body.outer
+
+for.body.outer:                                   ; preds = %for.body.outer.preheader, %for.end.inner
+  %indvars.outer = phi i64 [ %indvars.outer.next, %for.end.inner ], [ 0, %for.body.outer.preheader ]
+  %cmp2 = icmp sgt i64 %m, 0
+  br i1 %cmp2, label %for.body.inner.preheader, label %for.end.inner
+
+for.body.inner.preheader:                         ; preds = %for.body.outer
+  br label %for.body.inner
+
+for.body.inner:                                   ; preds = %for.body.inner.preheader, %for.body.inner
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body.inner ], [ 0, %for.body.inner.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %v = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %v, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, %n
+  br i1 %exitcond, label %for.end.inner.loopexit, label %for.body.inner
+
+for.end.inner.loopexit:                           ; preds = %for.body.inner
+  store i64 %indvars.iv, i64 *%O1, align 4
+  br label %for.end.inner
+
+for.end.inner:                                    ; preds = %for.end.inner.loopexit, %for.body.outer
+  %indvars.outer.next = add i64 %indvars.outer, 1
+  %exitcond.outer = icmp eq i64 %indvars.outer, %m
+  br i1 %exitcond.outer, label %for.end.outer.loopexit, label %for.body.outer
+
+for.end.outer.loopexit:                           ; preds = %for.end.inner
+  store i64 %indvars.outer, i64 *%O2, align 4
+  br label %for.end.outer
+
+for.end.outer:                                    ; preds = %for.end.outer.loopexit, %entry
+  ret i64 undef
+}
diff --git a/test/Transforms/LoopVectorize/pr31098.ll b/test/Transforms/LoopVectorize/pr31098.ll
new file mode 100644
index 000000000000..368a948557c3
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr31098.ll
@@ -0,0 +1,100 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -debug-only=loop-accesses < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the compile-time-unknown dependence-distance is resolved
+; statically. Due to the non-unit stride of the accesses in this testcase
+; we are currently not able to create runtime dependence checks, and therefore
+; if we don't resolve the dependence statically we cannot vectorize the loop.
+;
+; Specifically in this example, during dependence analysis we get 6 unknown
+; dependence distances between the 8 real/imaginary accesses below:
+; dist = 8*D, 4+8*D, -4+8*D, -8*D, 4-8*D, -4-8*D.
+; At compile time we can prove for all of the above that |dist|>loopBound*step
+; (where the step is 8 bytes, and the loopBound is D-1), and thereby conclude
+; that there are no dependencies (without runtime tests):
+; |8*D|>8*D-8, |4+8*D|>8*D-8, |-4+8*D|>8*D-8, etc.
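Spelling out that inequality for all six distances (step = 8 bytes and loopBound = D-1, so the right-hand side is 8*(D-1) = 8*D-8, with D >= 1):

  |8*D|    = 8*D   > 8*D-8
  |4+8*D|  = 8*D+4 > 8*D-8
  |-4+8*D| = 8*D-4 > 8*D-8
  |-8*D|   = 8*D   > 8*D-8
  |4-8*D|  = 8*D-4 > 8*D-8  (for D >= 1)
  |-4-8*D| = 8*D+4 > 8*D-8

Every distance exceeds the span the loop can access, which is why LAA can discard all six dependences without emitting runtime checks.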
+ +; #include <stdlib.h> +; class Complex { +; private: +; float real_; +; float imaginary_; +; +; public: +; Complex() : real_(0), imaginary_(0) { } +; Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { } +; Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { } +; +; inline float real() const { return real_; } +; inline float imaginary() const { return imaginary_; } +; +; Complex operator+(const Complex& rhs) const +; { +; return Complex(real_ + rhs.real_, imaginary_ + rhs.imaginary_); +; } +; +; Complex operator-(const Complex& rhs) const +; { +; return Complex(real_ - rhs.real_, imaginary_ - rhs.imaginary_); +; } +; }; +; +; void Test(Complex *out, size_t size) +; { +; size_t D = size / 2; +; for (size_t offset = 0; offset < D; ++offset) +; { +; Complex t0 = out[offset]; +; Complex t1 = out[offset + D]; +; out[offset] = t1 + t0; +; out[offset + D] = t0 - t1; +; } +; } + +; CHECK-LABEL: Test +; CHECK: LAA: No unsafe dependent memory operations in loop. We don't need runtime memory checks. +; CHECK: vector.body: +; CHECK: <4 x i32> + +%class.Complex = type { float, float } + +define void @Test(%class.Complex* nocapture %out, i64 %size) local_unnamed_addr { +entry: + %div = lshr i64 %size, 1 + %cmp47 = icmp eq i64 %div, 0 + br i1 %cmp47, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %offset.048 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %0 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 0 + %1 = load float, float* %0, align 4 + %imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 1 + %2 = load float, float* %imaginary_.i.i, align 4 + %add = add nuw i64 %offset.048, %div + %3 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 0 + %4 = load float, float* %3, align 4 + %imaginary_.i.i28 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 1 + %5 = load float, float* %imaginary_.i.i28, align 4 + %add.i = fadd fast float %4, %1 + %add4.i = fadd fast float %5, %2 + store float %add.i, float* %0, align 4 + store float %add4.i, float* %imaginary_.i.i, align 4 + %sub.i = fsub fast float %1, %4 + %sub4.i = fsub fast float %2, %5 + store float %sub.i, float* %3, align 4 + store float %sub4.i, float* %imaginary_.i.i28, align 4 + %inc = add nuw nsw i64 %offset.048, 1 + %exitcond = icmp eq i64 %inc, %div + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/test/Transforms/LoopVectorize/pr31190.ll b/test/Transforms/LoopVectorize/pr31190.ll index afb1754983cd..1ff8b2ba7ce4 100644 --- a/test/Transforms/LoopVectorize/pr31190.ll +++ b/test/Transforms/LoopVectorize/pr31190.ll @@ -9,13 +9,6 @@ ; Since %inc54 is the IV of the outer loop, and %0 equivalent to it, ; we get the situation described above. -; This test uses the new PM, because with the old PM, running loop-vectorize -; would explicitly run loop-simplify. Even though this loop is already in -; simplified form, loop-simplify would still clean up the phi. -; The reason this matters is that in a real optimizer pipeline, LICM can create -; such PHIs, and since it preserves loop simplified form, the cleanup has -; no chance to run. 
- ; Code that leads to this situation can look something like: ; ; int a, b[1], c; @@ -28,11 +21,14 @@ ; ; The PHI is an artifact of the register promotion of c. +; Note that we can no longer get the vectorizer to actually see such PHIs, +; because LV now simplifies the loop internally, but the test is still +; useful as a regression test, and in case loop-simplify behavior changes. + @c = external global i32, align 4 @a = external global i32, align 4 @b = external global [1 x i32], align 4 -; CHECK: LV: PHI is a recurrence with respect to an outer loop. ; CHECK: LV: Not vectorizing: Cannot prove legality. ; CHECK-LABEL: @test define void @test() { diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll index 4b300e04ea26..f521b623fad2 100644 --- a/test/Transforms/LoopVectorize/reduction.ll +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -493,3 +493,49 @@ exit: %inc.2 = add nsw i32 %inc511.1.inc4.1, 2 ret i32 %inc.2 } + +;CHECK-LABEL: @reduction_sum_multiuse( +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: %sum.lcssa = phi i32 [ %[[SCALAR:.*]], %.lr.ph ], [ %[[VECTOR:.*]], %middle.block ] +;CHECK: %sum.copy = phi i32 [ %[[SCALAR]], %.lr.ph ], [ %[[VECTOR]], %middle.block ] +;CHECK: ret i32 +define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph.preheader, label %end +.lr.ph.preheader: ; preds = %0 + br label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %sum.02, %6 + %8 = add i32 %7, %3 + %9 = add i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.lcssa = phi i32 [ %9, %.lr.ph ] + %sum.copy = phi i32 [ %9, %.lr.ph ] + br label %end + +end: + %f1 = phi i32 [ 0, %0 ], [ %sum.lcssa, %._crit_edge ] + %f2 = phi i32 [ 0, %0 ], [ %sum.copy, %._crit_edge ] + %final = add i32 %f1, %f2 + ret i32 %final +} diff --git a/test/Transforms/LoopVectorize/reverse_iter.ll b/test/Transforms/LoopVectorize/reverse_iter.ll index a6e2abda36d9..bd057698280b 100644 --- a/test/Transforms/LoopVectorize/reverse_iter.ll +++ b/test/Transforms/LoopVectorize/reverse_iter.ll @@ -2,7 +2,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; Make sure that the reverse iterators are calculated using 64bit arithmetic, not 32. +; PR15882: This test ensures that we do not produce wrapping arithmetic when +; creating constant reverse step vectors. 
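With a vector width of 4, the reverse induction is expected to be materialized by broadcasting the scalar induction value and adding constant per-lane offsets. A sketch of that shape (only the <i32 0, i32 -1, i32 -2, i32 -3> constant is taken from the CHECK line below; the broadcast pattern is an assumption modeled on the insertelement/shufflevector sequences checked in induction-step.ll earlier in this patch):

define <4 x i32> @reverse_step_sketch(i32 %index) {
  ; Broadcast the current scalar induction value to all four lanes.
  %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Step each lane backwards: index, index-1, index-2, index-3.
  %reverse.ind = add <4 x i32> %splat, <i32 0, i32 -1, i32 -2, i32 -3>
  ret <4 x i32> %reverse.ind
}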
; ; int foo(int n, int *A) { ; int sum; @@ -13,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; ;CHECK-LABEL: @foo( -;CHECK: <i64 0, i64 -1, i64 -2, i64 -3> +;CHECK: <i32 0, i32 -1, i32 -2, i32 -3> ;CHECK: ret define i32 @foo(i32 %n, i32* nocapture %A) { %1 = icmp sgt i32 %n, 0 diff --git a/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll b/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll new file mode 100644 index 000000000000..d3112b82d1d5 --- /dev/null +++ b/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S | FileCheck --enable-var-scope %s + +; Make sure we attach memcheck metadata to scalarized memory operations even if +; we're only unrolling. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: vector.memcheck: +; CHECK-LABEL: vector.body: +; CHECK: load i32, {{.*}} !alias.scope ![[$MD1:[0-9]+]] +; CHECK-LABEL: middle.block: +; CHECK-DAG: ![[$MD1]] = !{![[MD2:[0-9]+]]} +; CHECK-DAG: ![[MD2]] = distinct !{![[MD2]], ![[MD3:[0-9]+]]} +; CHECK-DAG: ![[MD3]] = distinct !{![[MD3]], !"LVerDomain"} + +; Function Attrs: norecurse nounwind uwtable +define void @test(i32* nocapture readonly %a, i32* nocapture %b) local_unnamed_addr #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, 77 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +attributes #0 = { norecurse nounwind uwtable } diff --git a/test/Transforms/LoopVectorize/vector-geps.ll b/test/Transforms/LoopVectorize/vector-geps.ll new file mode 100644 index 000000000000..bd79499d5d34 --- /dev/null +++ b/test/Transforms/LoopVectorize/vector-geps.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: @vector_gep_stored( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>* +; CHECK-NEXT: store <4 x i32*> [[TMP1]], <4 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %tmp0, i32** %tmp1, align 8 + %i.next = add 
nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: @uniform_vector_gep_stored( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, i64 1 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32*> [[DOTSPLATINSERT]], <4 x i32*> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>* +; CHECK-NEXT: store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %b, i64 1 + %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %tmp0, i32** %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVersioning/loop-invariant-bound.ll b/test/Transforms/LoopVersioning/loop-invariant-bound.ll index 3411adbf245e..01c5a55bd5b2 100644 --- a/test/Transforms/LoopVersioning/loop-invariant-bound.ll +++ b/test/Transforms/LoopVersioning/loop-invariant-bound.ll @@ -8,12 +8,13 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %Partials.215 = type { [2 x %Dual.213] } ; Function Attrs: sspreq -define void @"julia_axpy!_65480"(%Dual.212*) { +define void @"julia_axpy!_65480"(%Dual.212*, %Dual.212* %other) { top: br label %if24 ; CHECK-NOT: %bc = bitcast i64* %v2.sroa.0.0..sroa_cast -; CHECK: %bound0 +; CHECK: %bound0 = icmp ult i8* %[[x:[a-z0-9]+]], %[[y:[a-z0-9]+]] +; CHECK-NOT: %bound1 = icmp ult i8* %[[y]], %[[x]] if24: ; preds = %if24, %top %"#temp#1.sroa.3.02" = phi i64 [ undef, %top ], [ %2, %if24 ] @@ -24,7 +25,7 @@ if24: ; preds = %if24, %top %v2.sroa.0.0..sroa_cast = bitcast %Dual.212* %0 to i64* %v2.sroa.0.0.copyload = load i64, i64* %v2.sroa.0.0..sroa_cast, align 1 %3 = add i64 %"#temp#1.sroa.0.01", -1 - %4 = getelementptr inbounds %Dual.212, %Dual.212* undef, i64 %3, i32 1, i32 0, i64 0, i32 1, i32 0, i64 0 + %4 = getelementptr inbounds %Dual.212, %Dual.212* %other, i64 0, i32 1, i32 0, i64 0, i32 1, i32 0, i64 0 %5 = bitcast double* %4 to i64* store i64 undef, i64* %5, align 8 %notlhs27 = icmp eq i64 %2, undef diff --git a/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll b/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll index ff6c25087aa5..791c2e3210c8 100644 --- a/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll +++ b/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll @@ -16,7 +16,7 @@ ; CHECK-NEXT: %add8 = add nsw i32 %[[induction]], %add ; CHECK-NEXT: %inc = add nuw i32 %j.113, 1 ; CHECK-NEXT: %cmp2 = icmp ult i32 %inc, %itr -; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit6, !llvm.loop !5 +; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit7, !llvm.loop !5 define i32 @foo(i32* nocapture %var1, i32* nocapture readnone %var2, i32* nocapture %var3, i32 %itr) #0 { entry: %cmp14 = icmp eq i32 %itr, 
0 diff --git a/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll b/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll index 928a6527badc..53add6338022 100644 --- a/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll +++ b/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll @@ -7,7 +7,7 @@ ; CHECK: Loop: Loop at depth 2 containing: %for.body3.us<header><latch><exiting> ; CHECK-NEXT: Loop Versioning found to be beneficial ; -; CHECK: for.cond1.for.inc17_crit_edge.us.loopexit5: ; preds = %for.body3.us +; CHECK: for.cond1.for.inc17_crit_edge.us.loopexit6: ; preds = %for.body3.us ; CHECK-NEXT: %add14.us.lcssa = phi float [ %add14.us, %for.body3.us ] ; CHECK-NEXT: store float %add14.us.lcssa, float* %arrayidx.us, align 4, !alias.scope !0, !noalias !0 ; CHECK-NEXT: br label %for.cond1.for.inc17_crit_edge.us diff --git a/test/Transforms/LowerTypeTests/Inputs/import.yaml b/test/Transforms/LowerTypeTests/Inputs/import.yaml new file mode 100644 index 000000000000..d4a5c2c3c255 --- /dev/null +++ b/test/Transforms/LowerTypeTests/Inputs/import.yaml @@ -0,0 +1,31 @@ +--- +TypeIdMap: + allones7: + TTRes: + Kind: AllOnes + SizeM1BitWidth: 7 + allones32: + TTRes: + Kind: AllOnes + SizeM1BitWidth: 32 + bytearray7: + TTRes: + Kind: ByteArray + SizeM1BitWidth: 7 + bytearray32: + TTRes: + Kind: ByteArray + SizeM1BitWidth: 32 + inline5: + TTRes: + Kind: Inline + SizeM1BitWidth: 5 + inline6: + TTRes: + Kind: Inline + SizeM1BitWidth: 6 + single: + TTRes: + Kind: Single + SizeM1BitWidth: 0 +... diff --git a/test/Transforms/LowerTypeTests/Inputs/use-typeid1-typeid2.yaml b/test/Transforms/LowerTypeTests/Inputs/use-typeid1-typeid2.yaml new file mode 100644 index 000000000000..031b2e8de04e --- /dev/null +++ b/test/Transforms/LowerTypeTests/Inputs/use-typeid1-typeid2.yaml @@ -0,0 +1,5 @@ +--- +GlobalValueMap: + 42: + - TypeTests: [14276520915468743435, 15427464259790519041] # guid("typeid1"), guid("typeid2") +... 
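In this summary, 42 is the GUID of the referencing function, and the TypeTests list records which type identifiers it tests, keyed by the GUIDs of the names "typeid1" and "typeid2" (per the inline comment). In IR, such a test is a call to the llvm.type.test intrinsic with the type name as metadata; a minimal sketch, using the same intrinsic signature that appears in import.ll further down:

declare i1 @llvm.type.test(i8*, metadata) nounwind readnone

define i1 @tests_typeid1(i8* %ptr) {
  ; Returns true when %ptr points into the set of globals tagged !"typeid1".
  %result = call i1 @llvm.type.test(i8* %ptr, metadata !"typeid1")
  ret i1 %result
}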
diff --git a/test/Transforms/LowerTypeTests/export-allones.ll b/test/Transforms/LowerTypeTests/export-allones.ll new file mode 100644 index 000000000000..a642ec87355f --- /dev/null +++ b/test/Transforms/LowerTypeTests/export-allones.ll @@ -0,0 +1,161 @@ +; RUN: opt -S -lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t < %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +@foo = constant [2048 x i8] zeroinitializer, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5, !type !6, !type !7, !type !8, !type !9, !type !10, !type !11, !type !12, !type !13, !type !14, !type !15, !type !16, !type !17, !type !18, !type !19, !type !20, !type !21, !type !22, !type !23, !type !24, !type !25, !type !26, !type !27, !type !28, !type !29, !type !30, !type !31, !type !32, !type !33, !type !34, !type !35, !type !36, !type !37, !type !38, !type !39, !type !40, !type !41, !type !42, !type !43, !type !44, !type !45, !type !46, !type !47, !type !48, !type !49, !type !50, !type !51, !type !52, !type !53, !type !54, !type !55, !type !56, !type !57, !type !58, !type !59, !type !60, !type !61, !type !62, !type !63, !type !64, !type !65, !type !66, !type !67, !type !68, !type !69, !type !70, !type !71, !type !72, !type !73, !type !74, !type !75, !type !76, !type !77, !type !78, !type !79, !type !80, !type !81, !type !82, !type !83, !type !84, !type !85, !type !86, !type !87, !type !88, !type !89, !type !90, !type !91, !type !92, !type !93, !type !94, !type !95, !type !96, !type !97, !type !98, !type !99, !type !100, !type !101, !type !102, !type !103, !type !104, !type !105, !type !106, !type !107, !type !108, !type !109, !type !110, !type !111, !type !112, !type !113, !type !114, !type !115, !type !116, !type !117, !type !118, !type !119, !type !120, !type !121, !type !122, !type !123, !type !124, !type !125, !type !126, !type !127, !type !128, !type !129, !type !130 + +!0 = !{i32 0, !"typeid1"} +!1 = !{i32 2, !"typeid1"} + +!2 = !{i32 4, !"typeid2"} +!3 = !{i32 8, !"typeid2"} +!4 = !{i32 12, !"typeid2"} +!5 = !{i32 16, !"typeid2"} +!6 = !{i32 20, !"typeid2"} +!7 = !{i32 24, !"typeid2"} +!8 = !{i32 28, !"typeid2"} +!9 = !{i32 32, !"typeid2"} +!10 = !{i32 36, !"typeid2"} +!11 = !{i32 40, !"typeid2"} +!12 = !{i32 44, !"typeid2"} +!13 = !{i32 48, !"typeid2"} +!14 = !{i32 52, !"typeid2"} +!15 = !{i32 56, !"typeid2"} +!16 = !{i32 60, !"typeid2"} +!17 = !{i32 64, !"typeid2"} +!18 = !{i32 68, !"typeid2"} +!19 = !{i32 72, !"typeid2"} +!20 = !{i32 76, !"typeid2"} +!21 = !{i32 80, !"typeid2"} +!22 = !{i32 84, !"typeid2"} +!23 = !{i32 88, !"typeid2"} +!24 = !{i32 92, !"typeid2"} +!25 = !{i32 96, !"typeid2"} +!26 = !{i32 100, !"typeid2"} +!27 = !{i32 104, !"typeid2"} +!28 = !{i32 108, !"typeid2"} +!29 = !{i32 112, !"typeid2"} +!30 = !{i32 116, !"typeid2"} +!31 = !{i32 120, !"typeid2"} +!32 = !{i32 124, !"typeid2"} +!33 = !{i32 128, !"typeid2"} +!34 = !{i32 132, !"typeid2"} +!35 = !{i32 136, !"typeid2"} +!36 = !{i32 140, !"typeid2"} +!37 = !{i32 144, !"typeid2"} +!38 = !{i32 148, !"typeid2"} +!39 = !{i32 152, !"typeid2"} +!40 = !{i32 156, !"typeid2"} +!41 = !{i32 160, !"typeid2"} +!42 = !{i32 164, !"typeid2"} +!43 = !{i32 168, !"typeid2"} +!44 = !{i32 172, !"typeid2"} +!45 = !{i32 176, !"typeid2"} +!46 = !{i32 180, !"typeid2"} +!47 = !{i32 184, !"typeid2"} +!48 = !{i32 188, !"typeid2"} +!49 = !{i32 192, !"typeid2"} +!50 = !{i32 196, !"typeid2"} +!51 = !{i32 200, !"typeid2"} +!52 = !{i32 204, !"typeid2"} 
+!53 = !{i32 208, !"typeid2"} +!54 = !{i32 212, !"typeid2"} +!55 = !{i32 216, !"typeid2"} +!56 = !{i32 220, !"typeid2"} +!57 = !{i32 224, !"typeid2"} +!58 = !{i32 228, !"typeid2"} +!59 = !{i32 232, !"typeid2"} +!60 = !{i32 236, !"typeid2"} +!61 = !{i32 240, !"typeid2"} +!62 = !{i32 244, !"typeid2"} +!63 = !{i32 248, !"typeid2"} +!64 = !{i32 252, !"typeid2"} +!65 = !{i32 256, !"typeid2"} +!66 = !{i32 260, !"typeid2"} +!67 = !{i32 264, !"typeid2"} +!68 = !{i32 268, !"typeid2"} +!69 = !{i32 272, !"typeid2"} +!70 = !{i32 276, !"typeid2"} +!71 = !{i32 280, !"typeid2"} +!72 = !{i32 284, !"typeid2"} +!73 = !{i32 288, !"typeid2"} +!74 = !{i32 292, !"typeid2"} +!75 = !{i32 296, !"typeid2"} +!76 = !{i32 300, !"typeid2"} +!77 = !{i32 304, !"typeid2"} +!78 = !{i32 308, !"typeid2"} +!79 = !{i32 312, !"typeid2"} +!80 = !{i32 316, !"typeid2"} +!81 = !{i32 320, !"typeid2"} +!82 = !{i32 324, !"typeid2"} +!83 = !{i32 328, !"typeid2"} +!84 = !{i32 332, !"typeid2"} +!85 = !{i32 336, !"typeid2"} +!86 = !{i32 340, !"typeid2"} +!87 = !{i32 344, !"typeid2"} +!88 = !{i32 348, !"typeid2"} +!89 = !{i32 352, !"typeid2"} +!90 = !{i32 356, !"typeid2"} +!91 = !{i32 360, !"typeid2"} +!92 = !{i32 364, !"typeid2"} +!93 = !{i32 368, !"typeid2"} +!94 = !{i32 372, !"typeid2"} +!95 = !{i32 376, !"typeid2"} +!96 = !{i32 380, !"typeid2"} +!97 = !{i32 384, !"typeid2"} +!98 = !{i32 388, !"typeid2"} +!99 = !{i32 392, !"typeid2"} +!100 = !{i32 396, !"typeid2"} +!101 = !{i32 400, !"typeid2"} +!102 = !{i32 404, !"typeid2"} +!103 = !{i32 408, !"typeid2"} +!104 = !{i32 412, !"typeid2"} +!105 = !{i32 416, !"typeid2"} +!106 = !{i32 420, !"typeid2"} +!107 = !{i32 424, !"typeid2"} +!108 = !{i32 428, !"typeid2"} +!109 = !{i32 432, !"typeid2"} +!110 = !{i32 436, !"typeid2"} +!111 = !{i32 440, !"typeid2"} +!112 = !{i32 444, !"typeid2"} +!113 = !{i32 448, !"typeid2"} +!114 = !{i32 452, !"typeid2"} +!115 = !{i32 456, !"typeid2"} +!116 = !{i32 460, !"typeid2"} +!117 = !{i32 464, !"typeid2"} +!118 = !{i32 468, !"typeid2"} +!119 = !{i32 472, !"typeid2"} +!120 = !{i32 476, !"typeid2"} +!121 = !{i32 480, !"typeid2"} +!122 = !{i32 484, !"typeid2"} +!123 = !{i32 488, !"typeid2"} +!124 = !{i32 492, !"typeid2"} +!125 = !{i32 496, !"typeid2"} +!126 = !{i32 500, !"typeid2"} +!127 = !{i32 504, !"typeid2"} +!128 = !{i32 508, !"typeid2"} +!129 = !{i32 512, !"typeid2"} +!130 = !{i32 516, !"typeid2"} + +; CHECK: [[G:@[0-9]+]] = private constant { [2048 x i8] } zeroinitializer + +; CHECK: @__typeid_typeid1_global_addr = hidden alias i8, getelementptr inbounds ({ [2048 x i8] }, { [2048 x i8] }* [[G]], i32 0, i32 0, i32 0) +; CHECK: @__typeid_typeid1_align = hidden alias i8, inttoptr (i8 1 to i8*) +; CHECK: @__typeid_typeid1_size_m1 = hidden alias i8, inttoptr (i64 1 to i8*) + +; CHECK: @__typeid_typeid2_global_addr = hidden alias i8, getelementptr inbounds ({ [2048 x i8] }, { [2048 x i8] }* [[G]], i32 0, i32 0, i64 4) +; CHECK: @__typeid_typeid2_align = hidden alias i8, inttoptr (i8 2 to i8*) +; CHECK: @__typeid_typeid2_size_m1 = hidden alias i8, inttoptr (i64 128 to i8*) + +; CHECK: @foo = alias [2048 x i8], getelementptr inbounds ({ [2048 x i8] }, { [2048 x i8] }* [[G]], i32 0, i32 0) + +; SUMMARY: TypeIdMap: +; SUMMARY-NEXT: typeid1: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: AllOnes +; SUMMARY-NEXT: SizeM1BitWidth: 7 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: typeid2: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: AllOnes +; SUMMARY-NEXT: SizeM1BitWidth: 32 +; SUMMARY-NEXT: WPDRes: diff --git a/test/Transforms/LowerTypeTests/export-bytearray.ll 
b/test/Transforms/LowerTypeTests/export-bytearray.ll new file mode 100644 index 000000000000..7565b85df30f --- /dev/null +++ b/test/Transforms/LowerTypeTests/export-bytearray.ll @@ -0,0 +1,40 @@ +; RUN: opt -S -lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t < %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +@foo = constant [2048 x i8] zeroinitializer, !type !0, !type !1, !type !2, !type !3 + +!0 = !{i32 0, !"typeid1"} +!1 = !{i32 130, !"typeid1"} +!2 = !{i32 4, !"typeid2"} +!3 = !{i32 1032, !"typeid2"} + +; CHECK: [[G:@[0-9]+]] = private constant { [2048 x i8] } zeroinitializer +; CHECK: [[B:@[0-9]+]] = private constant [258 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01" + +; CHECK: @__typeid_typeid1_global_addr = hidden alias i8, getelementptr inbounds ({ [2048 x i8] }, { [2048 x i8] }* [[G]], i32 0, i32 0, i32 0) +; CHECK: @__typeid_typeid1_align = hidden alias i8, inttoptr (i8 1 to i8*) +; CHECK: @__typeid_typeid1_size_m1 = hidden alias i8, inttoptr (i64 65 to i8*) +; CHECK: @__typeid_typeid1_byte_array = hidden alias i8, i8* @bits.1 +; CHECK: @__typeid_typeid1_bit_mask = hidden alias i8, inttoptr (i8 2 to i8*) + +; CHECK: @__typeid_typeid2_global_addr = hidden alias i8, getelementptr inbounds ({ [2048 x i8] }, { [2048 x i8] }* [[G]], i32 0, i32 0, i64 4) +; CHECK: @__typeid_typeid2_align = hidden alias i8, inttoptr (i8 2 to i8*) +; CHECK: @__typeid_typeid2_size_m1 = hidden alias i8, inttoptr (i64 257 to i8*) +; CHECK: @__typeid_typeid2_byte_array = hidden alias i8, i8* @bits +; CHECK: @__typeid_typeid2_bit_mask = hidden alias i8, inttoptr (i8 1 to i8*) + +; CHECK: @foo = alias [2048 x i8], getelementptr inbounds ({ [2048 x i8] }, { [2048 x i8] }* [[G]], i32 0, i32 0) +; CHECK: @bits = private alias i8, getelementptr inbounds ([258 x i8], [258 x i8]* [[B]], i64 0, i64 0) +; CHECK: @bits.1 = private alias i8, getelementptr inbounds ([258 x i8], [258 x i8]* [[B]], i64 0, i64 0) + +; SUMMARY: TypeIdMap: +; SUMMARY-NEXT: typeid1: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: ByteArray +; SUMMARY-NEXT: SizeM1BitWidth: 7 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: typeid2: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: ByteArray +; SUMMARY-NEXT: SizeM1BitWidth: 32 +; SUMMARY-NEXT: WPDRes: diff --git a/test/Transforms/LowerTypeTests/export-inline.ll b/test/Transforms/LowerTypeTests/export-inline.ll new file mode 100644 index 000000000000..1da5866e88cc --- /dev/null +++ b/test/Transforms/LowerTypeTests/export-inline.ll @@ -0,0 +1,35 @@ +; RUN: opt -S -lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t < %s | FileCheck %s +; RUN: FileCheck 
--check-prefix=SUMMARY %s < %t + +@foo = constant [2048 x i8] zeroinitializer, !type !0, !type !1, !type !2, !type !3 + +!0 = !{i32 0, !"typeid1"} +!1 = !{i32 6, !"typeid1"} +!2 = !{i32 4, !"typeid2"} +!3 = !{i32 136, !"typeid2"} + +; CHECK: [[G:@[0-9]+]] = private constant { [2048 x i8] } zeroinitializer + +; CHECK: @__typeid_typeid1_global_addr = hidden alias i8, getelementptr inbounds ({ [2048 x i8] }, { [2048 x i8] }* [[G]], i32 0, i32 0, i32 0) +; CHECK: @__typeid_typeid1_align = hidden alias i8, inttoptr (i8 1 to i8*) +; CHECK: @__typeid_typeid1_size_m1 = hidden alias i8, inttoptr (i64 3 to i8*) +; CHECK: @__typeid_typeid1_inline_bits = hidden alias i8, inttoptr (i32 9 to i8*) + +; CHECK: @__typeid_typeid2_global_addr = hidden alias i8, getelementptr inbounds ({ [2048 x i8] }, { [2048 x i8] }* [[G]], i32 0, i32 0, i64 4) +; CHECK: @__typeid_typeid2_align = hidden alias i8, inttoptr (i8 2 to i8*) +; CHECK: @__typeid_typeid2_size_m1 = hidden alias i8, inttoptr (i64 33 to i8*) +; CHECK: @__typeid_typeid2_inline_bits = hidden alias i8, inttoptr (i64 8589934593 to i8*) + +; CHECK: @foo = alias [2048 x i8], getelementptr inbounds ({ [2048 x i8] }, { [2048 x i8] }* [[G]], i32 0, i32 0) + +; SUMMARY: TypeIdMap: +; SUMMARY-NEXT: typeid1: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Inline +; SUMMARY-NEXT: SizeM1BitWidth: 5 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: typeid2: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Inline +; SUMMARY-NEXT: SizeM1BitWidth: 6 +; SUMMARY-NEXT: WPDRes: diff --git a/test/Transforms/LowerTypeTests/export-single.ll b/test/Transforms/LowerTypeTests/export-single.ll new file mode 100644 index 000000000000..92e810c09776 --- /dev/null +++ b/test/Transforms/LowerTypeTests/export-single.ll @@ -0,0 +1,17 @@ +; RUN: opt -S -lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t < %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +@foo = constant i32 42, !type !0 + +!0 = !{i32 0, !"typeid1"} + +; CHECK: [[G:@[0-9]+]] = private constant { i32 } { i32 42 } + +; CHECK: @__typeid_typeid1_global_addr = hidden alias i8, bitcast ({ i32 }* [[G]] to i8*) +; CHECK: @foo = alias i32, getelementptr inbounds ({ i32 }, { i32 }* [[G]], i32 0, i32 0) + +; SUMMARY: TypeIdMap: +; SUMMARY-NEXT: typeid1: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Single +; SUMMARY-NEXT: SizeM1BitWidth: 0 diff --git a/test/Transforms/LowerTypeTests/external-global.ll b/test/Transforms/LowerTypeTests/external-global.ll new file mode 100644 index 000000000000..0b80374aed74 --- /dev/null +++ b/test/Transforms/LowerTypeTests/external-global.ll @@ -0,0 +1,14 @@ +; RUN: opt -S -lowertypetests -lowertypetests-summary-action=export -o - %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-scei-ps4" + +; CHECK: @dipsy = external +@dipsy = external constant i8, !type !0 + +define void @tinkywinky() { + store i8* @dipsy, i8** undef + ret void +} + +!0 = !{i64 16, !"teletubbies"} diff --git a/test/Transforms/LowerTypeTests/import-unsat.ll b/test/Transforms/LowerTypeTests/import-unsat.ll index 7410bc4b4d88..76b244001986 100644 --- a/test/Transforms/LowerTypeTests/import-unsat.ll +++ b/test/Transforms/LowerTypeTests/import-unsat.ll @@ -4,8 +4,7 @@ ; SUMMARY: GlobalValueMap: ; SUMMARY-NEXT: 42: -; SUMMARY-NEXT: - TypeTests: -; SUMMARY-NEXT: - 123 +; SUMMARY-NEXT: - TypeTests: [ 123 ] ; SUMMARY-NEXT: TypeIdMap: ; SUMMARY-NEXT: typeid1: ; SUMMARY-NEXT: 
TTRes: diff --git a/test/Transforms/LowerTypeTests/import.ll b/test/Transforms/LowerTypeTests/import.ll new file mode 100644 index 000000000000..1a5aceccd631 --- /dev/null +++ b/test/Transforms/LowerTypeTests/import.ll @@ -0,0 +1,170 @@ +; RUN: opt -S -lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%S/Inputs/import.yaml < %s | FileCheck %s + +target datalayout = "e-p:64:64" + +declare i1 @llvm.type.test(i8* %ptr, metadata %bitset) nounwind readnone + +; CHECK-DAG: @__typeid_single_global_addr = external hidden global i8 +; CHECK-DAG: @__typeid_inline6_global_addr = external hidden global i8 +; CHECK-DAG: @__typeid_inline6_align = external hidden global i8, !absolute_symbol !0 +; CHECK-DAG: @__typeid_inline6_size_m1 = external hidden global i8, !absolute_symbol !1 +; CHECK-DAG: @__typeid_inline6_inline_bits = external hidden global i8, !absolute_symbol !2 +; CHECK-DAG: @__typeid_inline5_global_addr = external hidden global i8 +; CHECK-DAG: @__typeid_inline5_align = external hidden global i8, !absolute_symbol !0 +; CHECK-DAG: @__typeid_inline5_size_m1 = external hidden global i8, !absolute_symbol !3 +; CHECK-DAG: @__typeid_inline5_inline_bits = external hidden global i8, !absolute_symbol !4 +; CHECK-DAG: @__typeid_bytearray32_global_addr = external hidden global i8 +; CHECK-DAG: @__typeid_bytearray32_align = external hidden global i8, !absolute_symbol !0 +; CHECK-DAG: @__typeid_bytearray32_size_m1 = external hidden global i8, !absolute_symbol !4 +; CHECK-DAG: @__typeid_bytearray32_byte_array = external hidden global i8 +; CHECK-DAG: @__typeid_bytearray32_bit_mask = external hidden global i8, !absolute_symbol !0 +; CHECK-DAG: @__typeid_bytearray7_global_addr = external hidden global i8 +; CHECK-DAG: @__typeid_bytearray7_align = external hidden global i8, !absolute_symbol !0 +; CHECK-DAG: @__typeid_bytearray7_size_m1 = external hidden global i8, !absolute_symbol !5 +; CHECK-DAG: @__typeid_bytearray7_byte_array = external hidden global i8 +; CHECK-DAG: @__typeid_bytearray7_bit_mask = external hidden global i8, !absolute_symbol !0 +; CHECK-DAG: @__typeid_allones32_global_addr = external hidden global i8 +; CHECK-DAG: @__typeid_allones32_align = external hidden global i8, !absolute_symbol !0 +; CHECK-DAG: @__typeid_allones32_size_m1 = external hidden global i8, !absolute_symbol !4 +; CHECK-DAG: @__typeid_allones7_global_addr = external hidden global i8 +; CHECK-DAG: @__typeid_allones7_align = external hidden global i8, !absolute_symbol !0 +; CHECK-DAG: @__typeid_allones7_size_m1 = external hidden global i8, !absolute_symbol !5 + +; CHECK: define i1 @allones7(i8* [[p:%.*]]) +define i1 @allones7(i8* %p) { + ; CHECK-NEXT: [[pi:%.*]] = ptrtoint i8* [[p]] to i64 + ; CHECK-NEXT: [[sub:%.*]] = sub i64 [[pi]], ptrtoint (i8* @__typeid_allones7_global_addr to i64) + ; CHECK-NEXT: [[lshr:%.*]] = lshr i64 [[sub]], zext (i8 ptrtoint (i8* @__typeid_allones7_align to i8) to i64) + ; CHECK-NEXT: [[shl:%.*]] = shl i64 [[sub]], zext (i8 sub (i8 64, i8 ptrtoint (i8* @__typeid_allones7_align to i8)) to i64) + ; CHECK-NEXT: [[or:%.*]] = or i64 [[lshr]], [[shl]] + ; CHECK-NEXT: [[ule:%.*]] = icmp ule i64 [[or]], ptrtoint (i8* @__typeid_allones7_size_m1 to i64) + ; CHECK-NEXT: ret i1 [[ule]] + %x = call i1 @llvm.type.test(i8* %p, metadata !"allones7") + ret i1 %x +} + +; CHECK: define i1 @allones32(i8* [[p:%.*]]) +define i1 @allones32(i8* %p) { + ; CHECK-NEXT: [[pi:%.*]] = ptrtoint i8* [[p]] to i64 + ; CHECK-NEXT: [[sub:%.*]] = sub i64 [[pi]], ptrtoint (i8* 
@__typeid_allones32_global_addr to i64) + ; CHECK-NEXT: [[lshr:%.*]] = lshr i64 [[sub]], zext (i8 ptrtoint (i8* @__typeid_allones32_align to i8) to i64) + ; CHECK-NEXT: [[shl:%.*]] = shl i64 [[sub]], zext (i8 sub (i8 64, i8 ptrtoint (i8* @__typeid_allones32_align to i8)) to i64) + ; CHECK-NEXT: [[or:%.*]] = or i64 [[lshr]], [[shl]] + ; CHECK-NEXT: [[ule:%.*]] = icmp ule i64 [[or]], ptrtoint (i8* @__typeid_allones32_size_m1 to i64) + ; CHECK-NEXT: ret i1 [[ule]] + %x = call i1 @llvm.type.test(i8* %p, metadata !"allones32") + ret i1 %x +} + +; CHECK: define i1 @bytearray7(i8* [[p:%.*]]) +define i1 @bytearray7(i8* %p) { + ; CHECK-NEXT: [[pi:%.*]] = ptrtoint i8* [[p]] to i64 + ; CHECK-NEXT: [[sub:%.*]] = sub i64 [[pi]], ptrtoint (i8* @__typeid_bytearray7_global_addr to i64) + ; CHECK-NEXT: [[lshr:%.*]] = lshr i64 [[sub]], zext (i8 ptrtoint (i8* @__typeid_bytearray7_align to i8) to i64) + ; CHECK-NEXT: [[shl:%.*]] = shl i64 [[sub]], zext (i8 sub (i8 64, i8 ptrtoint (i8* @__typeid_bytearray7_align to i8)) to i64) + ; CHECK-NEXT: [[or:%.*]] = or i64 [[lshr]], [[shl]] + ; CHECK-NEXT: [[ule:%.*]] = icmp ule i64 [[or]], ptrtoint (i8* @__typeid_bytearray7_size_m1 to i64) + ; CHECK-NEXT: br i1 [[ule]], label %[[t:.*]], label %[[f:.*]] + + ; CHECK: [[t]]: + ; CHECK-NEXT: [[gep:%.*]] = getelementptr i8, i8* @__typeid_bytearray7_byte_array, i64 [[or]] + ; CHECK-NEXT: [[load:%.*]] = load i8, i8* [[gep]] + ; CHECK-NEXT: [[and:%.*]] = and i8 [[load]], ptrtoint (i8* @__typeid_bytearray7_bit_mask to i8) + ; CHECK-NEXT: [[ne:%.*]] = icmp ne i8 [[and]], 0 + ; CHECK-NEXT: br label %[[f]] + + ; CHECK: [[f]]: + ; CHECK-NEXT: [[phi:%.*]] = phi i1 [ false, %0 ], [ [[ne]], %[[t]] ] + ; CHECK-NEXT: ret i1 [[phi]] + %x = call i1 @llvm.type.test(i8* %p, metadata !"bytearray7") + ret i1 %x +} + +; CHECK: define i1 @bytearray32(i8* [[p:%.*]]) +define i1 @bytearray32(i8* %p) { + ; CHECK-NEXT: [[pi:%.*]] = ptrtoint i8* [[p]] to i64 + ; CHECK-NEXT: [[sub:%.*]] = sub i64 [[pi]], ptrtoint (i8* @__typeid_bytearray32_global_addr to i64) + ; CHECK-NEXT: [[lshr:%.*]] = lshr i64 [[sub]], zext (i8 ptrtoint (i8* @__typeid_bytearray32_align to i8) to i64) + ; CHECK-NEXT: [[shl:%.*]] = shl i64 [[sub]], zext (i8 sub (i8 64, i8 ptrtoint (i8* @__typeid_bytearray32_align to i8)) to i64) + ; CHECK-NEXT: [[or:%.*]] = or i64 [[lshr]], [[shl]] + ; CHECK-NEXT: [[ule:%.*]] = icmp ule i64 [[or]], ptrtoint (i8* @__typeid_bytearray32_size_m1 to i64) + ; CHECK-NEXT: br i1 [[ule]], label %[[t:.*]], label %[[f:.*]] + + ; CHECK: [[t]]: + ; CHECK-NEXT: [[gep:%.*]] = getelementptr i8, i8* @__typeid_bytearray32_byte_array, i64 [[or]] + ; CHECK-NEXT: [[load:%.*]] = load i8, i8* [[gep]] + ; CHECK-NEXT: [[and:%.*]] = and i8 [[load]], ptrtoint (i8* @__typeid_bytearray32_bit_mask to i8) + ; CHECK-NEXT: [[ne:%.*]] = icmp ne i8 [[and]], 0 + ; CHECK-NEXT: br label %[[f]] + + ; CHECK: [[f]]: + ; CHECK-NEXT: [[phi:%.*]] = phi i1 [ false, %0 ], [ [[ne]], %[[t]] ] + ; CHECK-NEXT: ret i1 [[phi]] + %x = call i1 @llvm.type.test(i8* %p, metadata !"bytearray32") + ret i1 %x +} + +; CHECK: define i1 @inline5(i8* [[p:%.*]]) +define i1 @inline5(i8* %p) { + ; CHECK-NEXT: [[pi:%.*]] = ptrtoint i8* [[p]] to i64 + ; CHECK-NEXT: [[sub:%.*]] = sub i64 [[pi]], ptrtoint (i8* @__typeid_inline5_global_addr to i64) + ; CHECK-NEXT: [[lshr:%.*]] = lshr i64 [[sub]], zext (i8 ptrtoint (i8* @__typeid_inline5_align to i8) to i64) + ; CHECK-NEXT: [[shl:%.*]] = shl i64 [[sub]], zext (i8 sub (i8 64, i8 ptrtoint (i8* @__typeid_inline5_align to i8)) to i64) + ; CHECK-NEXT: [[or:%.*]] = or i64 
[[lshr]], [[shl]] + ; CHECK-NEXT: [[ule:%.*]] = icmp ule i64 [[or]], ptrtoint (i8* @__typeid_inline5_size_m1 to i64) + ; CHECK-NEXT: br i1 [[ule]], label %[[t:.*]], label %[[f:.*]] + + ; CHECK: [[t]]: + ; CHECK-NEXT: [[trunc:%.*]] = trunc i64 [[or]] to i32 + ; CHECK-NEXT: [[and:%.*]] = and i32 [[trunc]], 31 + ; CHECK-NEXT: [[shl2:%.*]] = shl i32 1, [[and]] + ; CHECK-NEXT: [[and2:%.*]] = and i32 ptrtoint (i8* @__typeid_inline5_inline_bits to i32), [[shl2]] + ; CHECK-NEXT: [[ne:%.*]] = icmp ne i32 [[and2]], 0 + ; CHECK-NEXT: br label %[[f]] + + ; CHECK: [[f]]: + ; CHECK-NEXT: [[phi:%.*]] = phi i1 [ false, %0 ], [ [[ne]], %[[t]] ] + ; CHECK-NEXT: ret i1 [[phi]] + %x = call i1 @llvm.type.test(i8* %p, metadata !"inline5") + ret i1 %x +} + +; CHECK: define i1 @inline6(i8* [[p:%.*]]) +define i1 @inline6(i8* %p) { + ; CHECK-NEXT: [[pi:%.*]] = ptrtoint i8* [[p]] to i64 + ; CHECK-NEXT: [[sub:%.*]] = sub i64 [[pi]], ptrtoint (i8* @__typeid_inline6_global_addr to i64) + ; CHECK-NEXT: [[lshr:%.*]] = lshr i64 [[sub]], zext (i8 ptrtoint (i8* @__typeid_inline6_align to i8) to i64) + ; CHECK-NEXT: [[shl:%.*]] = shl i64 [[sub]], zext (i8 sub (i8 64, i8 ptrtoint (i8* @__typeid_inline6_align to i8)) to i64) + ; CHECK-NEXT: [[or:%.*]] = or i64 [[lshr]], [[shl]] + ; CHECK-NEXT: [[ule:%.*]] = icmp ule i64 [[or]], ptrtoint (i8* @__typeid_inline6_size_m1 to i64) + ; CHECK-NEXT: br i1 [[ule]], label %[[t:.*]], label %[[f:.*]] + + ; CHECK: [[t]]: + ; CHECK-NEXT: [[and:%.*]] = and i64 [[or]], 63 + ; CHECK-NEXT: [[shl2:%.*]] = shl i64 1, [[and]] + ; CHECK-NEXT: [[and2:%.*]] = and i64 ptrtoint (i8* @__typeid_inline6_inline_bits to i64), [[shl2]] + ; CHECK-NEXT: [[ne:%.*]] = icmp ne i64 [[and2]], 0 + ; CHECK-NEXT: br label %[[f]] + + ; CHECK: [[f]]: + ; CHECK-NEXT: [[phi:%.*]] = phi i1 [ false, %0 ], [ [[ne]], %[[t]] ] + ; CHECK-NEXT: ret i1 [[phi]] + %x = call i1 @llvm.type.test(i8* %p, metadata !"inline6") + ret i1 %x +} + +; CHECK: define i1 @single(i8* [[p:%.*]]) +define i1 @single(i8* %p) { + ; CHECK-NEXT: [[pi:%.*]] = ptrtoint i8* [[p]] to i64 + ; CHECK-NEXT: [[eq:%.*]] = icmp eq i64 [[pi]], ptrtoint (i8* @__typeid_single_global_addr to i64) + ; CHECK-NEXT: ret i1 [[eq]] + %x = call i1 @llvm.type.test(i8* %p, metadata !"single") + ret i1 %x +} + +; CHECK: !0 = !{i64 0, i64 256} +; CHECK: !1 = !{i64 0, i64 64} +; CHECK: !2 = !{i64 -1, i64 -1} +; CHECK: !3 = !{i64 0, i64 32} +; CHECK: !4 = !{i64 0, i64 4294967296} +; CHECK: !5 = !{i64 0, i64 128} diff --git a/test/Transforms/Mem2Reg/ignore-lifetime.ll b/test/Transforms/Mem2Reg/ignore-lifetime.ll index 12adaffc7714..b996a659237a 100644 --- a/test/Transforms/Mem2Reg/ignore-lifetime.ll +++ b/test/Transforms/Mem2Reg/ignore-lifetime.ll @@ -1,16 +1,16 @@ ; RUN: opt -mem2reg -S -o - < %s | FileCheck %s -declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr) -declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr) +declare void @llvm.lifetime.start.p0i8(i64 %size, i8* nocapture %ptr) +declare void @llvm.lifetime.end.p0i8(i64 %size, i8* nocapture %ptr) define void @test1() { ; CHECK: test1 ; CHECK-NOT: alloca %A = alloca i32 %B = bitcast i32* %A to i8* - call void @llvm.lifetime.start(i64 2, i8* %B) + call void @llvm.lifetime.start.p0i8(i64 2, i8* %B) store i32 1, i32* %A - call void @llvm.lifetime.end(i64 2, i8* %B) + call void @llvm.lifetime.end.p0i8(i64 2, i8* %B) ret void } @@ -19,8 +19,8 @@ define void @test2() { ; CHECK-NOT: alloca %A = alloca {i8, i16} %B = getelementptr {i8, i16}, {i8, i16}* %A, i32 0, i32 0 - call void @llvm.lifetime.start(i64 2, 
i8* %B) + call void @llvm.lifetime.start.p0i8(i64 2, i8* %B) store {i8, i16} zeroinitializer, {i8, i16}* %A - call void @llvm.lifetime.end(i64 2, i8* %B) + call void @llvm.lifetime.end.p0i8(i64 2, i8* %B) ret void } diff --git a/test/Transforms/Mem2Reg/preserve-nonnull-load-metadata.ll b/test/Transforms/Mem2Reg/preserve-nonnull-load-metadata.ll new file mode 100644 index 000000000000..33a5b124c555 --- /dev/null +++ b/test/Transforms/Mem2Reg/preserve-nonnull-load-metadata.ll @@ -0,0 +1,89 @@ +; RUN: opt < %s -mem2reg -S | FileCheck %s + +; This tests that mem2reg preserves the !nonnull metadata on loads +; from allocas that get optimized out. + +; Check the case where the alloca in question has a single store. +define float* @single_store(float** %arg) { +; CHECK-LABEL: define float* @single_store +; CHECK: %arg.load = load float*, float** %arg, align 8 +; CHECK: [[ASSUME:%(.*)]] = icmp ne float* %arg.load, null +; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]]) +; CHECK: ret float* %arg.load +entry: + %buf = alloca float* + %arg.load = load float*, float** %arg, align 8 + store float* %arg.load, float** %buf, align 8 + %buf.load = load float*, float **%buf, !nonnull !0 + ret float* %buf.load +} + +; Check the case where the alloca in question has more than one +; store but still within one basic block. +define float* @single_block(float** %arg) { +; CHECK-LABEL: define float* @single_block +; CHECK: %arg.load = load float*, float** %arg, align 8 +; CHECK: [[ASSUME:%(.*)]] = icmp ne float* %arg.load, null +; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]]) +; CHECK: ret float* %arg.load +entry: + %buf = alloca float* + %arg.load = load float*, float** %arg, align 8 + store float* null, float** %buf, align 8 + store float* %arg.load, float** %buf, align 8 + %buf.load = load float*, float **%buf, !nonnull !0 + ret float* %buf.load +} + +; Check the case where the alloca in question has more than one +; store and also reads and writes in multiple blocks. +define float* @multi_block(float** %arg) { +; CHECK-LABEL: define float* @multi_block +; CHECK-LABEL: entry: +; CHECK: %arg.load = load float*, float** %arg, align 8 +; CHECK: br label %next +; CHECK-LABEL: next: +; CHECK: [[ASSUME:%(.*)]] = icmp ne float* %arg.load, null +; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]]) +; CHECK: ret float* %arg.load +entry: + %buf = alloca float* + %arg.load = load float*, float** %arg, align 8 + store float* null, float** %buf, align 8 + br label %next +next: + store float* %arg.load, float** %buf, align 8 + %buf.load = load float*, float** %buf, !nonnull !0 + ret float* %buf.load +} + +; Check that we don't add an assume if it's not +; necessary, i.e. the value is already implied to be nonnull +define float* @no_assume(float** %arg) { +; CHECK-LABEL: define float* @no_assume +; CHECK-LABEL: entry: +; CHECK: %arg.load = load float*, float** %arg, align 8 +; CHECK: %cn = icmp ne float* %arg.load, null +; CHECK: br i1 %cn, label %next, label %fin +; CHECK-LABEL: next: +; CHECK-NOT: call void @llvm.assume +; CHECK: ret float* %arg.load +; CHECK-LABEL: fin: +; CHECK: ret float* null +entry: + %buf = alloca float* + %arg.load = load float*, float** %arg, align 8 + %cn = icmp ne float* %arg.load, null + br i1 %cn, label %next, label %fin +next: +; At this point the above nonnull check ensures that +; the value %arg.load is nonnull in this block and thus +; we need not add the assume.
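; (Editor's aside, not part of the original test: a minimal sketch, assuming the
; behavior checked in @single_store above, of what mem2reg emits when the fact
; does need preserving after the annotated load is deleted; %nn is an
; illustrative name:
;   %nn = icmp ne float* %arg.load, null
;   call void @llvm.assume(i1 %nn)
; Here the dominating %cn branch already establishes non-nullness on this path,
; so no such assume is required.)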
+ store float* %arg.load, float** %buf, align 8 + %buf.load = load float*, float** %buf, !nonnull !0 + ret float* %buf.load +fin: + ret float* null +} + +!0 = !{} diff --git a/test/Transforms/MemCpyOpt/lifetime.ll b/test/Transforms/MemCpyOpt/lifetime.ll index 6a7e44692daa..77b495f2b583 100644 --- a/test/Transforms/MemCpyOpt/lifetime.ll +++ b/test/Transforms/MemCpyOpt/lifetime.ll @@ -4,8 +4,8 @@ ; @llvm.lifetime.start and @llvm.memcpy. declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1 -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 define void @_ZN4CordC2EOS_(i8* nocapture dereferenceable(16) %arg1) { bb: @@ -14,11 +14,11 @@ bb: ; CHECK: ret void %tmp = alloca [8 x i8], align 8 %tmp5 = bitcast [8 x i8]* %tmp to i8* - call void @llvm.lifetime.start(i64 16, i8* %tmp5) + call void @llvm.lifetime.start.p0i8(i64 16, i8* %tmp5) %tmp10 = getelementptr inbounds i8, i8* %tmp5, i64 7 store i8 0, i8* %tmp10, align 1 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arg1, i8* %tmp5, i64 16, i32 8, i1 false) - call void @llvm.lifetime.end(i64 16, i8* %tmp5) + call void @llvm.lifetime.end.p0i8(i64 16, i8* %tmp5) ret void } diff --git a/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll b/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll index e3e57f09d88f..e21dc87cb6a0 100644 --- a/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll +++ b/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll @@ -7,11 +7,11 @@ define void @foo([8 x i64]* noalias nocapture sret dereferenceable(64) %sret) { entry-block: %a = alloca [8 x i64], align 8 %a.cast = bitcast [8 x i64]* %a to i8* - call void @llvm.lifetime.start(i64 64, i8* %a.cast) + call void @llvm.lifetime.start.p0i8(i64 64, i8* %a.cast) call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 0, i64 64, i32 8, i1 false) %sret.cast = bitcast [8 x i64]* %sret to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %sret.cast, i8* %a.cast, i64 64, i32 8, i1 false) - call void @llvm.lifetime.end(i64 64, i8* %a.cast) + call void @llvm.lifetime.end.p0i8(i64 64, i8* %a.cast) ret void ; CHECK-LABEL: @foo( @@ -25,14 +25,14 @@ define void @bar([8 x i64]* noalias nocapture sret dereferenceable(64) %sret, [8 entry-block: %a = alloca [8 x i64], align 8 %a.cast = bitcast [8 x i64]* %a to i8* - call void @llvm.lifetime.start(i64 64, i8* %a.cast) + call void @llvm.lifetime.start.p0i8(i64 64, i8* %a.cast) call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 0, i64 64, i32 8, i1 false) %sret.cast = bitcast [8 x i64]* %sret to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %sret.cast, i8* %a.cast, i64 64, i32 8, i1 false) call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 42, i64 32, i32 8, i1 false) %out.cast = bitcast [8 x i64]* %out to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out.cast, i8* %a.cast, i64 64, i32 8, i1 false) - call void @llvm.lifetime.end(i64 64, i8* %a.cast) + call void @llvm.lifetime.end.p0i8(i64 64, i8* %a.cast) ret void ; CHECK-LABEL: @bar( @@ -48,8 +48,8 @@ entry-block: ; CHECK: ret void } -declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture 
readonly, i64, i32, i1) nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/Transforms/MemCpyOpt/memcpy-undef.ll b/test/Transforms/MemCpyOpt/memcpy-undef.ll index c75d020c0786..06a41829a4ee 100644 --- a/test/Transforms/MemCpyOpt/memcpy-undef.ll +++ b/test/Transforms/MemCpyOpt/memcpy-undef.ll @@ -22,7 +22,7 @@ define i32 @test1(%struct.foo* nocapture %foobie) nounwind noinline ssp uwtable } define void @test2(i8* sret noalias nocapture %out, i8* %in) nounwind noinline ssp uwtable { - call void @llvm.lifetime.start(i64 8, i8* %in) + call void @llvm.lifetime.start.p0i8(i64 8, i8* %in) call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 8, i32 1, i1 false) ret void @@ -32,7 +32,7 @@ define void @test3(i8* sret noalias nocapture %out, i8* %in) nounwind noinline s - call void @llvm.lifetime.start(i64 4, i8* %in) + call void @llvm.lifetime.start.p0i8(i64 4, i8* %in) call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 8, i32 1, i1 false) ret void @@ -43,4 +43,4 @@ define void @test3(i8* sret noalias nocapture %out, i8* %in) nounwind noinline s declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind diff --git a/test/Transforms/MemCpyOpt/memcpy.ll b/test/Transforms/MemCpyOpt/memcpy.ll index 6181543cfc63..e4d50f7157de 100644 --- a/test/Transforms/MemCpyOpt/memcpy.ll +++ b/test/Transforms/MemCpyOpt/memcpy.ll @@ -76,8 +76,21 @@ define void @test4(i8 *%P) { ; CHECK-NEXT: call void @test4a( } +; Make sure we don't remove the memcpy if the source address space doesn't match the byval argument +define void @test4_addrspace(i8 addrspace(1)* %P) { + %A = alloca %1 + %a = bitcast %1* %A to i8* + call void @llvm.memcpy.p0i8.p1i8.i64(i8* %a, i8 addrspace(1)* %P, i64 8, i32 4, i1 false) + call void @test4a(i8* align 1 byval %a) + ret void +; CHECK-LABEL: @test4_addrspace( +; CHECK: call void @llvm.memcpy.p0i8.p1i8.i64( +; CHECK-NEXT: call void @test4a( +} + declare void @test4a(i8* align 1 byval) declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind +declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind %struct.S = type { i128, [4 x i8]} @@ -202,6 +215,21 @@ define void @test10(%opaque* noalias nocapture sret %x, i32 %y) { ret void } +; Don't create new addrspacecasts when we don't know they're safe for the target +define void @test11([20 x i32] addrspace(1)* nocapture dereferenceable(80) %P) { + %A = alloca [20 x i32], align 4 + %a = bitcast [20 x i32]* %A to i8* + %b = bitcast [20 x i32] addrspace(1)* %P to i8 addrspace(1)* + call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 80, i32 4, i1 false) + call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %b, i8* %a, i64 80, i32 4, i1 false) + ret void +; CHECK-LABEL: @test11( +; CHECK-NOT: addrspacecast +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind +declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i32, i1) nounwind + declare void @f1(%struct.big* nocapture sret) declare void @f2(%struct.big*) diff --git a/test/Transforms/MemCpyOpt/pr29105.ll 
b/test/Transforms/MemCpyOpt/pr29105.ll index 0d3778372266..03b176c4d245 100644 --- a/test/Transforms/MemCpyOpt/pr29105.ll +++ b/test/Transforms/MemCpyOpt/pr29105.ll @@ -11,25 +11,25 @@ entry-block: %0 = bitcast [2048 x i64]* %tmp0 to i8* %tmp2 = alloca %Foo, align 8 %x.sroa.0.0..sroa_cast6 = bitcast [2048 x i64]* %x.sroa.0 to i8* - call void @llvm.lifetime.start(i64 16384, i8* %x.sroa.0.0..sroa_cast6) - call void @llvm.lifetime.start(i64 16384, i8* %0) + call void @llvm.lifetime.start.p0i8(i64 16384, i8* %x.sroa.0.0..sroa_cast6) + call void @llvm.lifetime.start.p0i8(i64 16384, i8* %0) call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 16384, i32 8, i1 false) call void @llvm.memcpy.p0i8.p0i8.i64(i8* %x.sroa.0.0..sroa_cast6, i8* %0, i64 16384, i32 8, i1 false) - call void @llvm.lifetime.end(i64 16384, i8* %0) + call void @llvm.lifetime.end.p0i8(i64 16384, i8* %0) %1 = bitcast %Foo* %tmp2 to i8* - call void @llvm.lifetime.start(i64 16384, i8* %1) + call void @llvm.lifetime.start.p0i8(i64 16384, i8* %1) call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %x.sroa.0.0..sroa_cast6, i64 16384, i32 8, i1 false) call void @bar(%Foo* noalias nocapture nonnull dereferenceable(16384) %tmp2) - call void @llvm.lifetime.end(i64 16384, i8* %1) - call void @llvm.lifetime.end(i64 16384, i8* %x.sroa.0.0..sroa_cast6) + call void @llvm.lifetime.end.p0i8(i64 16384, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 16384, i8* %x.sroa.0.0..sroa_cast6) ret void } -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1 -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 declare void @bar(%Foo* noalias nocapture readonly dereferenceable(16384)) unnamed_addr #0 diff --git a/test/Transforms/MergeFunc/mergefunc-preserve-debug-info.ll b/test/Transforms/MergeFunc/mergefunc-preserve-debug-info.ll new file mode 100644 index 000000000000..cf76893d4aab --- /dev/null +++ b/test/Transforms/MergeFunc/mergefunc-preserve-debug-info.ll @@ -0,0 +1,223 @@ +; RUN: opt -O0 -S -mergefunc -mergefunc-preserve-debug-info < %s | FileCheck %s --check-prefix=OPTIMIZATION_LEVEL_0 +; RUN: opt -O2 -S -mergefunc -mergefunc-preserve-debug-info < %s | FileCheck %s --check-prefix=OPTIMIZATION_LEVEL_2 + +; Preserve debug info in thunks under -mergefunc -mergefunc-preserve-debug-info +; +; We test that: +; At -O0 we have preserved the generated @llvm.dbg.declare debug intrinsics. +; At -O2 we have preserved the generated @llvm.dbg.value debug intrinsics. +; At -O0, stores from the incoming parameters to locations on the stack-frame +; and allocas that create these locations on the stack-frame are preserved. +; Debug info got generated for the call made by the thunk and for its return value. +; The foregoing is the only content of a thunk's entry block. +; A thunk makes a tail call to the shared implementation. +; A thunk's call site is preserved to point to the thunk (with only -mergefunc the +; call site is modified to point to the shared implementation) when both occur +; within the same translation unit. 
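; (Editor's illustration, not from the original test: per the note above, with
; plain -mergefunc the call site in @f would be rewritten to target the shared
; implementation,
;   %call1 = call i32 @maxA(i32 1, i32 9)
; whereas with -mergefunc-preserve-debug-info it keeps targeting the thunk,
;   %call1 = call i32 @maxB(i32 1, i32 9)
; which is what the OPTIMIZATION_LEVEL_0 checks below verify.)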
+ +; The source code that was used to test and generate this LLVM IR is: +; +; int maxA(int x, int y) { +; int i, m, j; +; if (x > y) +; m = x; +; else +; m = y; +; return m; +; } +; +; int maxB(int x, int y) { +; int i, m, j; +; if (x > y) +; m = x; +; else +; m = y; +; return m; +; } +; +; void f(void) { +; +; maxA(3, 4); +; maxB(1, 9); +; } + +; Function Attrs: nounwind uwtable +define i32 @maxA(i32 %x, i32 %y) !dbg !6 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %m = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !11, metadata !12), !dbg !13 + store i32 %y, i32* %y.addr, align 4 + call void @llvm.dbg.declare(metadata i32* %y.addr, metadata !14, metadata !12), !dbg !15 + call void @llvm.dbg.declare(metadata i32* %i, metadata !16, metadata !12), !dbg !17 + call void @llvm.dbg.declare(metadata i32* %m, metadata !18, metadata !12), !dbg !19 + call void @llvm.dbg.declare(metadata i32* %j, metadata !20, metadata !12), !dbg !21 + %0 = load i32, i32* %x.addr, align 4, !dbg !22 + %1 = load i32, i32* %y.addr, align 4, !dbg !24 + %cmp = icmp sgt i32 %0, %1, !dbg !25 + br i1 %cmp, label %if.then, label %if.else, !dbg !26 + +if.then: ; preds = %entry + %2 = load i32, i32* %x.addr, align 4, !dbg !27 + store i32 %2, i32* %m, align 4, !dbg !28 + br label %if.end, !dbg !29 + +if.else: ; preds = %entry + %3 = load i32, i32* %y.addr, align 4, !dbg !30 + store i32 %3, i32* %m, align 4, !dbg !31 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %4 = load i32, i32* %m, align 4, !dbg !32 + ret i32 %4, !dbg !33 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +; Function Attrs: nounwind uwtable +define i32 @maxB(i32 %x, i32 %y) !dbg !34 { + +; OPTIMIZATION_LEVEL_0: define i32 @maxB(i32 %x, i32 %y) +; OPTIMIZATION_LEVEL_0-NEXT: entry: +; OPTIMIZATION_LEVEL_0-NEXT: %x.addr = alloca i32, align 4 +; OPTIMIZATION_LEVEL_0-NEXT: %y.addr = alloca i32, align 4 +; OPTIMIZATION_LEVEL_0-NEXT: store i32 %x, i32* %x.addr, align 4 +; OPTIMIZATION_LEVEL_0-NEXT: call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_0-NEXT: store i32 %y, i32* %y.addr, align 4 +; OPTIMIZATION_LEVEL_0-NEXT: call void @llvm.dbg.declare(metadata i32* %y.addr, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_0-NEXT: %0 = tail call i32 @maxA(i32 %x, i32 %y), !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_0-NEXT: ret i32 %0, !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_0-NEXT: } + +; OPTIMIZATION_LEVEL_2: define i32 @maxB(i32 %x, i32 %y) +; OPTIMIZATION_LEVEL_2-NEXT: entry: +; OPTIMIZATION_LEVEL_2-NEXT: tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_2-NEXT: tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_2-NEXT: %0 = tail call i32 @maxA(i32 %x, i32 %y) #{{[0-9]+}}, !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_2-NEXT: ret i32 %0, !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_2-NEXT: } + +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %m = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !35, metadata !12), !dbg !36 
+ store i32 %y, i32* %y.addr, align 4 + call void @llvm.dbg.declare(metadata i32* %y.addr, metadata !37, metadata !12), !dbg !38 + call void @llvm.dbg.declare(metadata i32* %i, metadata !39, metadata !12), !dbg !40 + call void @llvm.dbg.declare(metadata i32* %m, metadata !41, metadata !12), !dbg !42 + call void @llvm.dbg.declare(metadata i32* %j, metadata !43, metadata !12), !dbg !44 + %0 = load i32, i32* %x.addr, align 4, !dbg !45 + %1 = load i32, i32* %y.addr, align 4, !dbg !47 + %cmp = icmp sgt i32 %0, %1, !dbg !48 + br i1 %cmp, label %if.then, label %if.else, !dbg !49 + +if.then: ; preds = %entry + %2 = load i32, i32* %x.addr, align 4, !dbg !50 + store i32 %2, i32* %m, align 4, !dbg !51 + br label %if.end, !dbg !52 + +if.else: ; preds = %entry + %3 = load i32, i32* %y.addr, align 4, !dbg !53 + store i32 %3, i32* %m, align 4, !dbg !54 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %4 = load i32, i32* %m, align 4, !dbg !55 + ret i32 %4, !dbg !56 +} + +; Function Attrs: nounwind uwtable +define void @f() !dbg !57 { +entry: + +; OPTIMIZATION_LEVEL_0: define void @f() +; OPTIMIZATION_LEVEL_0-NEXT: entry: +; OPTIMIZATION_LEVEL_0-NEXT: %call = call i32 @maxA(i32 3, i32 4), !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_0-NEXT: %call1 = call i32 @maxB(i32 1, i32 9), !dbg !{{[0-9]+}} +; OPTIMIZATION_LEVEL_0-NEXT: ret void, !dbg !{{[0-9]+}} + +; OPTIMIZATION_LEVEL_2: define void @f() +; OPTIMIZATION_LEVEL_2-NEXT: entry: +; OPTIMIZATION_LEVEL_2-NEXT: ret void, !dbg !{{[0-9]+}} + + %call = call i32 @maxA(i32 3, i32 4), !dbg !60 + %call1 = call i32 @maxB(i32 1, i32 9), !dbg !61 + ret void, !dbg !62 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "mergefunc-preserve-debug-info.c", directory: "") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!""} +!6 = distinct !DISubprogram(name: "maxA", scope: !7, file: !7, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!7 = !DIFile(filename: "./mergefunc-preserve-debug-info.c", directory: "") +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !10, !10} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DILocalVariable(name: "x", arg: 1, scope: !6, file: !7, line: 1, type: !10) +!12 = !DIExpression() +!13 = !DILocation(line: 1, column: 14, scope: !6) +!14 = !DILocalVariable(name: "y", arg: 2, scope: !6, file: !7, line: 1, type: !10) +!15 = !DILocation(line: 1, column: 21, scope: !6) +!16 = !DILocalVariable(name: "i", scope: !6, file: !7, line: 2, type: !10) +!17 = !DILocation(line: 2, column: 7, scope: !6) +!18 = !DILocalVariable(name: "m", scope: !6, file: !7, line: 2, type: !10) +!19 = !DILocation(line: 2, column: 10, scope: !6) +!20 = !DILocalVariable(name: "j", scope: !6, file: !7, line: 2, type: !10) +!21 = !DILocation(line: 2, column: 13, scope: !6) +!22 = !DILocation(line: 3, column: 7, scope: !23) +!23 = distinct !DILexicalBlock(scope: !6, file: !7, line: 3, column: 7) +!24 = !DILocation(line: 3, column: 11, scope: !23) +!25 = !DILocation(line: 3, column: 9, scope: !23) +!26 = !DILocation(line: 3, column: 7, scope: !6) +!27 = !DILocation(line: 4, column: 9, scope: !23) +!28 = !DILocation(line: 4, column: 7, scope: !23) +!29 = !DILocation(line: 4, column: 5, scope: !23) +!30 = 
!DILocation(line: 6, column: 9, scope: !23) +!31 = !DILocation(line: 6, column: 7, scope: !23) +!32 = !DILocation(line: 7, column: 10, scope: !6) +!33 = !DILocation(line: 7, column: 3, scope: !6) +!34 = distinct !DISubprogram(name: "maxB", scope: !7, file: !7, line: 10, type: !8, isLocal: false, isDefinition: true, scopeLine: 10, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!35 = !DILocalVariable(name: "x", arg: 1, scope: !34, file: !7, line: 10, type: !10) +!36 = !DILocation(line: 10, column: 14, scope: !34) +!37 = !DILocalVariable(name: "y", arg: 2, scope: !34, file: !7, line: 10, type: !10) +!38 = !DILocation(line: 10, column: 21, scope: !34) +!39 = !DILocalVariable(name: "i", scope: !34, file: !7, line: 11, type: !10) +!40 = !DILocation(line: 11, column: 7, scope: !34) +!41 = !DILocalVariable(name: "m", scope: !34, file: !7, line: 11, type: !10) +!42 = !DILocation(line: 11, column: 10, scope: !34) +!43 = !DILocalVariable(name: "j", scope: !34, file: !7, line: 11, type: !10) +!44 = !DILocation(line: 11, column: 13, scope: !34) +!45 = !DILocation(line: 12, column: 7, scope: !46) +!46 = distinct !DILexicalBlock(scope: !34, file: !7, line: 12, column: 7) +!47 = !DILocation(line: 12, column: 11, scope: !46) +!48 = !DILocation(line: 12, column: 9, scope: !46) +!49 = !DILocation(line: 12, column: 7, scope: !34) +!50 = !DILocation(line: 13, column: 9, scope: !46) +!51 = !DILocation(line: 13, column: 7, scope: !46) +!52 = !DILocation(line: 13, column: 5, scope: !46) +!53 = !DILocation(line: 15, column: 9, scope: !46) +!54 = !DILocation(line: 15, column: 7, scope: !46) +!55 = !DILocation(line: 16, column: 10, scope: !34) +!56 = !DILocation(line: 16, column: 3, scope: !34) +!57 = distinct !DISubprogram(name: "f", scope: !7, file: !7, line: 19, type: !58, isLocal: false, isDefinition: true, scopeLine: 19, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!58 = !DISubroutineType(types: !59) +!59 = !{null} +!60 = !DILocation(line: 21, column: 3, scope: !57) +!61 = !DILocation(line: 22, column: 3, scope: !57) +!62 = !DILocation(line: 23, column: 1, scope: !57) diff --git a/test/Transforms/MetaRenamer/metarenamer.ll b/test/Transforms/MetaRenamer/metarenamer.ll index 213fbe3bbff7..7b527ae54cb1 100644 --- a/test/Transforms/MetaRenamer/metarenamer.ll +++ b/test/Transforms/MetaRenamer/metarenamer.ll @@ -96,3 +96,18 @@ define i32 @varargs_func_6_xxx(i32 %arg_1_xxx, i32 %arg_2_xxx, ...) 
nounwind uwt store i32 %arg_2_xxx, i32* %2, align 4 ret i32 6 } + +declare noalias i8* @malloc(i32) +declare void @free(i8* nocapture) + +define void @dont_rename_lib_funcs() { +; CHECK-LABEL: @foo( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = call i8* @malloc(i32 23) +; CHECK-NEXT: call void @free(i8* [[TMP]]) +; CHECK-NEXT: ret void +; + %x = call i8* @malloc(i32 23) + call void @free(i8* %x) + ret void +} diff --git a/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll b/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll index 402de50c72cf..27a798bf7dd1 100644 --- a/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll +++ b/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll @@ -1,4 +1,4 @@ -; XFAIL: * +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -newgvn -S | FileCheck %s %struct..0anon = type { i32 } @@ -9,37 +9,34 @@ @n_spills = external global i32 ; <i32*> [#uses=2] define i32 @reload(%struct.rtx_def* %first, i32 %global, %struct.FILE* %dumpfile) { +; CHECK-LABEL: @reload( +; CHECK-NEXT: cond_next2835.1: +; CHECK-NEXT: br label [[BB2928:%.*]] +; CHECK: bb2928: +; CHECK-NEXT: br i1 false, label [[COND_NEXT2943:%.*]], label [[COND_TRUE2935:%.*]] +; CHECK: cond_true2935: +; CHECK-NEXT: br label [[COND_NEXT2943]] +; CHECK: cond_next2943: +; CHECK-NEXT: br i1 false, label [[BB2982_PREHEADER:%.*]], label [[BB2928]] +; CHECK: bb2982.preheader: +; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: ret i32 undef +; cond_next2835.1: ; preds = %cond_next2861 - %tmp2922 = load i32, i32* @n_spills, align 4 ; <i32> [#uses=0] - br label %bb2928 + %tmp2922 = load i32, i32* @n_spills, align 4 ; <i32> [#uses=0] + br label %bb2928 bb2928: ; preds = %cond_next2835.1, %cond_next2943 - br i1 false, label %cond_next2943, label %cond_true2935 + br i1 false, label %cond_next2943, label %cond_true2935 cond_true2935: ; preds = %bb2928 - br label %cond_next2943 + br label %cond_next2943 cond_next2943: ; preds = %cond_true2935, %bb2928 - br i1 false, label %bb2982.preheader, label %bb2928 + br i1 false, label %bb2982.preheader, label %bb2928 bb2982.preheader: ; preds = %cond_next2943 - %tmp298316 = load i32, i32* @n_spills, align 4 ; <i32> [#uses=0] - ret i32 %tmp298316 + %tmp298316 = load i32, i32* @n_spills, align 4 ; <i32> [#uses=0] + ret i32 %tmp298316 } - -; CHECK: define i32 @reload(%struct.rtx_def* %first, i32 %global, %struct.FILE* %dumpfile) { -; CHECK-NEXT: cond_next2835.1: -; CHECK-NEXT: br label %bb2928 -; CHECK: bb2928: -; CHECK-NEXT: br i1 false, label %bb2928.cond_next2943_crit_edge, label %cond_true2935 -; CHECK: bb2928.cond_next2943_crit_edge: -; CHECK-NEXT: br label %cond_next2943 -; CHECK: cond_true2935: -; CHECK-NEXT: br label %cond_next2943 -; CHECK: cond_next2943: -; CHECK-NEXT: br i1 false, label %bb2982.preheader, label %bb2928 -; CHECK: bb2982.preheader: -; CHECK-NEXT: %tmp298316 = load i32, i32* @n_spills, align 4 -; CHECK-NEXT: ret i32 %tmp298316 -; CHECK-NEXT: } diff --git a/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll b/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll index 4b47b06f1657..86c80d1d5f21 100644 --- a/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll +++ b/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: opt < %s -newgvn -S | FileCheck %s ; @@ -9,7 +8,8 @@ entry: %uadd = tail call %0 @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %uadd.0 = extractvalue %0 %uadd, 0 %add1 = add i64 %a, %b - ret i64 %add1 + %add2 = add i64 %add1, %uadd.0 + ret i64 %add2 } ; 
CHECK-LABEL: @test1( @@ -21,7 +21,8 @@ entry: %usub = tail call %0 @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %usub.0 = extractvalue %0 %usub, 0 %sub1 = sub i64 %a, %b - ret i64 %sub1 + %add2 = add i64 %sub1, %usub.0 + ret i64 %add2 } ; CHECK-LABEL: @test2( @@ -33,7 +34,8 @@ entry: %umul = tail call %0 @llvm.umul.with.overflow.i64(i64 %a, i64 %b) %umul.0 = extractvalue %0 %umul, 0 %mul1 = mul i64 %a, %b - ret i64 %mul1 + %add2 = add i64 %mul1, %umul.0 + ret i64 %add2 } ; CHECK-LABEL: @test3( @@ -45,7 +47,8 @@ entry: %sadd = tail call %0 @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) %sadd.0 = extractvalue %0 %sadd, 0 %add1 = add i64 %a, %b - ret i64 %add1 + %add2 = add i64 %add1, %sadd.0 + ret i64 %add2 } ; CHECK-LABEL: @test4( @@ -57,7 +60,8 @@ entry: %ssub = tail call %0 @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) %ssub.0 = extractvalue %0 %ssub, 0 %sub1 = sub i64 %a, %b - ret i64 %sub1 + %add2 = add i64 %sub1, %ssub.0 + ret i64 %add2 } ; CHECK-LABEL: @test5( @@ -69,7 +73,8 @@ entry: %smul = tail call %0 @llvm.smul.with.overflow.i64(i64 %a, i64 %b) %smul.0 = extractvalue %0 %smul, 0 %mul1 = mul i64 %a, %b - ret i64 %mul1 + %add2 = add i64 %mul1, %smul.0 + ret i64 %add2 } ; CHECK-LABEL: @test6( diff --git a/test/Transforms/NewGVN/basic-cyclic-opt.ll b/test/Transforms/NewGVN/basic-cyclic-opt.ll index 523ed2612e3c..7830d7ea78a5 100644 --- a/test/Transforms/NewGVN/basic-cyclic-opt.ll +++ b/test/Transforms/NewGVN/basic-cyclic-opt.ll @@ -169,7 +169,6 @@ define i32 @vnum_test3(i32* %data) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = icmp slt i32 [[I_0]], 30 ; CHECK-NEXT: br i1 [[TMP10]], label [[BB11:%.*]], label [[BB14:%.*]] ; CHECK: bb11: -; CHECK-NEXT: store i32 0, i32* [[TMP9]], align 4 ; CHECK-NEXT: br label [[BB14]] ; CHECK: bb14: ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1 @@ -228,6 +227,87 @@ bb23: ; preds = %bb4 ret i32 %p.0 } +;; This is an irreducible test case that will cause a memoryphi node loop +;; in the two blocks. +;; It's equivalent to something like +;; *a = 0 +;; if (<....>) goto loopmiddle +;; loopstart: +;; loopmiddle: +;; load *a +;; *a = 0 +;; if (<....>) goto loopstart otherwise goto loopend +;; loopend: +;; load *a +;; add the results of the loads +;; return them +;; +;; Both loads should equal 0, but it requires being +;; completely optimistic about MemoryPhis, otherwise +;; we will not be able to see through the cycle. +define i8 @irreducible_memoryphi(i8* noalias %arg, i8* noalias %arg2) { +; CHECK-LABEL: @irreducible_memoryphi( +; CHECK-NEXT: bb: +; CHECK-NEXT: store i8 0, i8* [[ARG:%.*]] +; CHECK-NEXT: br i1 undef, label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB3:%.*]] +; CHECK: bb3: +; CHECK-NEXT: ret i8 0 +; +bb: + store i8 0, i8 *%arg + br i1 undef, label %bb2, label %bb1 + +bb1: ; preds = %bb2, %bb + br label %bb2 + +bb2: ; preds = %bb1, %bb + %tmp2 = load i8, i8* %arg + store i8 0, i8 *%arg + br i1 undef, label %bb1, label %bb3 + +bb3: ; preds = %bb2 + %tmp = load i8, i8* %arg + %tmp3 = add i8 %tmp, %tmp2 + ret i8 %tmp3 +} +;; This is an irreducible test case that will cause a phi node loop +;; in the two blocks +;; +;; It should return 0, but it requires being +;; completely optimistic about phis, otherwise +;; we will not be able to see through the cycle. 
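;; (Editor's sketch of the optimistic fixpoint, added for illustration: both
;; phis start at TOP, i.e. assumed congruent to everything; %phi1 =
;; phi(%tmp, %phi2) then evaluates to %tmp, as does %phi2 = phi(%tmp, %phi1);
;; no later iteration contradicts this, so %tmp3 = sub i32 %tmp, %phi2 is
;; congruent to %tmp - %tmp and folds to 0.)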
+define i32 @irreducible_phi(i32 %arg) { +; CHECK-LABEL: @irreducible_phi( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 undef, label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB3:%.*]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 0 +; +bb: + %tmp = add i32 0, %arg + br i1 undef, label %bb2, label %bb1 + +bb1: ; preds = %bb2, %bb + %phi1 = phi i32 [%tmp, %bb], [%phi2, %bb2] + br label %bb2 + +bb2: ; preds = %bb1, %bb + %phi2 = phi i32 [%tmp, %bb], [%phi1, %bb1] + br i1 undef, label %bb1, label %bb3 + +bb3: ; preds = %bb2 + ; This should be zero + %tmp3 = sub i32 %tmp, %phi2 + ret i32 %tmp3 +} attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.ident = !{!0, !0, !0} diff --git a/test/Transforms/NewGVN/bitcast-of-call.ll b/test/Transforms/NewGVN/bitcast-of-call.ll index 7b25038275b5..2b817fbcd01c 100644 --- a/test/Transforms/NewGVN/bitcast-of-call.ll +++ b/test/Transforms/NewGVN/bitcast-of-call.ll @@ -1,14 +1,20 @@ -; XFAIL: * +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -newgvn -S | FileCheck %s ; PR2213 define i32* @f(i8* %x) { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = call i8* @m(i32 12) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP]] to i32* +; CHECK-NEXT: ret i32* [[TMP1]] +; entry: - %tmp = call i8* @m( i32 12 ) ; <i8*> [#uses=2] - %tmp1 = bitcast i8* %tmp to i32* ; <i32*> [#uses=0] - %tmp2 = bitcast i8* %tmp to i32* ; <i32*> [#uses=0] -; CHECK-NOT: %tmp2 - ret i32* %tmp2 + %tmp = call i8* @m( i32 12 ) ; <i8*> [#uses=2] + %tmp1 = bitcast i8* %tmp to i32* ; <i32*> [#uses=0] + %tmp3 = bitcast i32* %tmp1 to i8* + %tmp2 = bitcast i8* %tmp3 to i32* ; <i32*> [#uses=0] + ret i32* %tmp2 } declare i8* @m(i32) diff --git a/test/Transforms/NewGVN/calloc-load-removal.ll b/test/Transforms/NewGVN/calloc-load-removal.ll index e6870442064b..cdeb971a23e2 100644 --- a/test/Transforms/NewGVN/calloc-load-removal.ll +++ b/test/Transforms/NewGVN/calloc-load-removal.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: opt -S -basicaa -newgvn < %s | FileCheck %s ; RUN: opt -S -basicaa -newgvn -disable-simplify-libcalls < %s | FileCheck %s -check-prefix=CHECK_NO_LIBCALLS ; Check that loads from calloc are recognized as being zero. diff --git a/test/Transforms/NewGVN/calls-nonlocal.ll b/test/Transforms/NewGVN/calls-nonlocal.ll index 292060db812e..6e918050d591 100644 --- a/test/Transforms/NewGVN/calls-nonlocal.ll +++ b/test/Transforms/NewGVN/calls-nonlocal.ll @@ -1,4 +1,6 @@ ; XFAIL: * +;; NewGVN zaps the strlens, but currently takes two iterations to evaluate the conditions, because +;; we prune predicateinfo, and the icmps only become equivalent after the strlens are zapped ; Two occurrences of strlen should be zapped. 
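;; (Editor's sketch of the two-iteration behavior described above, with
;; hypothetical IR names:
;;   %len1 = call i32 @strlen(i8* %s)
;;   %cmp1 = icmp eq i32 %len1, 2
;;   ...
;;   %len2 = call i32 @strlen(i8* %s)  ; unified with %len1 on the first pass
;;   %cmp2 = icmp eq i32 %len2, 2      ; only then does %cmp2 become congruent to %cmp1
;; )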
; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" diff --git a/test/Transforms/NewGVN/cond_br2.ll b/test/Transforms/NewGVN/cond_br2.ll index e511ff7ed514..ff7a76d14695 100644 --- a/test/Transforms/NewGVN/cond_br2.ll +++ b/test/Transforms/NewGVN/cond_br2.ll @@ -19,7 +19,7 @@ define void @_Z4testv() #0 personality i8* bitcast (i32 (...)* @__gxx_personalit entry: %sv = alloca %"class.llvm::SmallVector", align 16 %0 = bitcast %"class.llvm::SmallVector"* %sv to i8* - call void @llvm.lifetime.start(i64 64, i8* %0) #1 + call void @llvm.lifetime.start.p0i8(i64 64, i8* %0) #1 %BeginX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0 %FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3 %1 = bitcast %"union.llvm::SmallVectorBase::U"* %FirstEl.i.i.i.i.i.i to i8* @@ -95,7 +95,7 @@ if.then.i.i.i20: ; preds = %invoke.cont3 br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21 _ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20 - call void @llvm.lifetime.end(i64 64, i8* %0) #1 + call void @llvm.lifetime.end.p0i8(i64 64, i8* %0) #1 ret void lpad: ; preds = %if.end.i14, %if.end.i, %invoke.cont2 @@ -114,14 +114,14 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa } ; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 declare i32 @__gxx_personality_v0(...) declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(%"class.llvm::SmallVector"*) #2 ; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(%"class.llvm::SmallVectorBase"*, i64, i64) #2 diff --git a/test/Transforms/NewGVN/condprop-xfail.ll b/test/Transforms/NewGVN/condprop-xfail.ll new file mode 100644 index 000000000000..5c049617f875 --- /dev/null +++ b/test/Transforms/NewGVN/condprop-xfail.ll @@ -0,0 +1,123 @@ +; XFAIL: * +; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s + +@a = external global i32 ; <i32*> [#uses=7] + +;; NewGVN takes two passes to get this, because we prune predicateinfo +; CHECK-LABEL: @test1( +define i32 @test1() nounwind { +entry: + %0 = load i32, i32* @a, align 4 + %1 = icmp eq i32 %0, 4 + br i1 %1, label %bb, label %bb1 + +bb: ; preds = %entry + br label %bb8 + +bb1: ; preds = %entry + %2 = load i32, i32* @a, align 4 + %3 = icmp eq i32 %2, 5 + br i1 %3, label %bb2, label %bb3 + +bb2: ; preds = %bb1 + br label %bb8 + +bb3: ; preds = %bb1 + %4 = load i32, i32* @a, align 4 + %5 = icmp eq i32 %4, 4 +; CHECK: br i1 false, label %bb4, label %bb5 + br i1 %5, label %bb4, label %bb5 + +bb4: ; preds = %bb3 + %6 = load i32, i32* @a, align 4 + %7 = add i32 %6, 5 + br label %bb8 + +bb5: ; preds = %bb3 + %8 = load i32, i32* @a, align 4 + %9 = icmp eq i32 %8, 5 +; CHECK: br i1 false, label %bb6, label %bb7 + br i1 %9, label %bb6, label %bb7 + +bb6: ; preds = %bb5 + %10 = load i32, i32* @a, align 4 + %11 = add i32 %10, 4 + br label %bb8 + +bb7: ; preds = %bb5 + %12 = load i32, i32* @a, align 4 + br label %bb8 + +bb8: ; preds = %bb7, %bb6, %bb4, %bb2, %bb + %.0 = phi i32 [ %12, %bb7 ], [ %11, %bb6 ], [ %7, %bb4 ], [ 4, %bb2 ], [ 5, %bb ] + br label %return + +return: ; preds = 
%bb8 + ret i32 %.0 +} +;; NewGVN takes two passes to get test[6,8] and test[6,8]_fp's main part +;; The icmp ne requires an equality table that inserts the inequalities for each +;; discovered equality while processing. +; CHECK-LABEL: @test6( +define i1 @test6(i32 %x, i32 %y) { + %cmp2 = icmp ne i32 %x, %y + %cmp = icmp eq i32 %x, %y + %cmp3 = icmp eq i32 %x, %y + br i1 %cmp, label %same, label %different + +same: +; CHECK: ret i1 false + ret i1 %cmp2 + +different: +; CHECK: ret i1 false + ret i1 %cmp3 +} + +; CHECK-LABEL: @test6_fp( +define i1 @test6_fp(float %x, float %y) { + %cmp2 = fcmp une float %x, %y + %cmp = fcmp oeq float %x, %y + %cmp3 = fcmp oeq float %x, %y + br i1 %cmp, label %same, label %different + +same: +; CHECK: ret i1 false + ret i1 %cmp2 + +different: +; CHECK: ret i1 false + ret i1 %cmp3 +} +; CHECK-LABEL: @test8( +define i1 @test8(i32 %x, i32 %y) { + %cmp2 = icmp sle i32 %x, %y + %cmp = icmp sgt i32 %x, %y + %cmp3 = icmp sgt i32 %x, %y + br i1 %cmp, label %same, label %different + +same: +; CHECK: ret i1 false + ret i1 %cmp2 + +different: +; CHECK: ret i1 false + ret i1 %cmp3 +} + +; CHECK-LABEL: @test8_fp( +define i1 @test8_fp(float %x, float %y) { + %cmp2 = fcmp ule float %x, %y + %cmp = fcmp ogt float %x, %y + %cmp3 = fcmp ogt float %x, %y + br i1 %cmp, label %same, label %different + +same: +; CHECK: ret i1 false + ret i1 %cmp2 + +different: +; CHECK: ret i1 false + ret i1 %cmp3 +} + diff --git a/test/Transforms/NewGVN/condprop.ll b/test/Transforms/NewGVN/condprop.ll index 898690dec199..6eb9bb6b2619 100644 --- a/test/Transforms/NewGVN/condprop.ll +++ b/test/Transforms/NewGVN/condprop.ll @@ -1,266 +1,211 @@ -; XFAIL: * +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s -@a = external global i32 ; <i32*> [#uses=7] - -; CHECK-LABEL: @test1( -define i32 @test1() nounwind { -entry: - %0 = load i32, i32* @a, align 4 - %1 = icmp eq i32 %0, 4 - br i1 %1, label %bb, label %bb1 - -bb: ; preds = %entry - br label %bb8 - -bb1: ; preds = %entry - %2 = load i32, i32* @a, align 4 - %3 = icmp eq i32 %2, 5 - br i1 %3, label %bb2, label %bb3 - -bb2: ; preds = %bb1 - br label %bb8 - -bb3: ; preds = %bb1 - %4 = load i32, i32* @a, align 4 - %5 = icmp eq i32 %4, 4 -; CHECK: br i1 false, label %bb4, label %bb5 - br i1 %5, label %bb4, label %bb5 - -bb4: ; preds = %bb3 - %6 = load i32, i32* @a, align 4 - %7 = add i32 %6, 5 - br label %bb8 - -bb5: ; preds = %bb3 - %8 = load i32, i32* @a, align 4 - %9 = icmp eq i32 %8, 5 -; CHECK: br i1 false, label %bb6, label %bb7 - br i1 %9, label %bb6, label %bb7 - -bb6: ; preds = %bb5 - %10 = load i32, i32* @a, align 4 - %11 = add i32 %10, 4 - br label %bb8 - -bb7: ; preds = %bb5 - %12 = load i32, i32* @a, align 4 - br label %bb8 - -bb8: ; preds = %bb7, %bb6, %bb4, %bb2, %bb - %.0 = phi i32 [ %12, %bb7 ], [ %11, %bb6 ], [ %7, %bb4 ], [ 4, %bb2 ], [ 5, %bb ] - br label %return - -return: ; preds = %bb8 - ret i32 %.0 -} declare void @foo(i1) declare void @bar(i32) -; CHECK-LABEL: @test3( define void @test3(i32 %x, i32 %y) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] +; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]] +; CHECK: both_zero: +; CHECK-NEXT: call void @foo(i1 true) +; CHECK-NEXT: call void @foo(i1 true) +; CHECK-NEXT: call void @bar(i32 0) +; CHECK-NEXT: call void @bar(i32 0) +; CHECK-NEXT: ret void +; CHECK: nope: 
+; CHECK-NEXT: call void @foo(i1 false) +; CHECK-NEXT: ret void +; %xz = icmp eq i32 %x, 0 %yz = icmp eq i32 %y, 0 %z = and i1 %xz, %yz br i1 %z, label %both_zero, label %nope both_zero: call void @foo(i1 %xz) -; CHECK: call void @foo(i1 true) call void @foo(i1 %yz) -; CHECK: call void @foo(i1 true) call void @bar(i32 %x) -; CHECK: call void @bar(i32 0) call void @bar(i32 %y) -; CHECK: call void @bar(i32 0) ret void nope: call void @foo(i1 %z) -; CHECK: call void @foo(i1 false) ret void } - -; CHECK-LABEL: @test4( define void @test4(i1 %b, i32 %x) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]] +; CHECK: sw: +; CHECK-NEXT: switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[CASE0:%.*]] +; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 2, label [[CASE0]] +; CHECK-NEXT: i32 3, label [[CASE3]] +; CHECK-NEXT: i32 4, label [[DEFAULT]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: call void @bar(i32 [[X]]) +; CHECK-NEXT: ret void +; CHECK: case0: +; CHECK-NEXT: call void @bar(i32 [[X]]) +; CHECK-NEXT: ret void +; CHECK: case1: +; CHECK-NEXT: call void @bar(i32 1) +; CHECK-NEXT: ret void +; CHECK: case3: +; CHECK-NEXT: call void @bar(i32 [[X]]) +; CHECK-NEXT: ret void +; br i1 %b, label %sw, label %case3 sw: switch i32 %x, label %default [ - i32 0, label %case0 - i32 1, label %case1 - i32 2, label %case0 - i32 3, label %case3 - i32 4, label %default + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case0 + i32 3, label %case3 + i32 4, label %default ] default: -; CHECK: default: call void @bar(i32 %x) -; CHECK: call void @bar(i32 %x) ret void case0: -; CHECK: case0: call void @bar(i32 %x) -; CHECK: call void @bar(i32 %x) ret void case1: -; CHECK: case1: call void @bar(i32 %x) -; CHECK: call void @bar(i32 1) ret void case3: -; CHECK: case3: call void @bar(i32 %x) -; CHECK: call void @bar(i32 %x) ret void } -; CHECK-LABEL: @test5( define i1 @test5(i32 %x, i32 %y) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: ret i1 false +; CHECK: different: +; CHECK-NEXT: ret i1 false +; %cmp = icmp eq i32 %x, %y br i1 %cmp, label %same, label %different same: %cmp2 = icmp ne i32 %x, %y -; CHECK: ret i1 false ret i1 %cmp2 different: %cmp3 = icmp eq i32 %x, %y -; CHECK: ret i1 false ret i1 %cmp3 } -; CHECK-LABEL: @test6( -define i1 @test6(i32 %x, i32 %y) { - %cmp2 = icmp ne i32 %x, %y - %cmp = icmp eq i32 %x, %y - %cmp3 = icmp eq i32 %x, %y - br i1 %cmp, label %same, label %different - -same: -; CHECK: ret i1 false - ret i1 %cmp2 -different: -; CHECK: ret i1 false - ret i1 %cmp3 -} - -; CHECK-LABEL: @test6_fp( -define i1 @test6_fp(float %x, float %y) { - %cmp2 = fcmp une float %x, %y - %cmp = fcmp oeq float %x, %y - %cmp3 = fcmp oeq float %x, %y - br i1 %cmp, label %same, label %different - -same: -; CHECK: ret i1 false - ret i1 %cmp2 - -different: -; CHECK: ret i1 false - ret i1 %cmp3 -} - -; CHECK-LABEL: @test7( define i1 @test7(i32 %x, i32 %y) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: ret i1 false +; CHECK: different: +; CHECK-NEXT: ret i1 false +; %cmp = icmp sgt i32 %x, %y br i1 %cmp, label %same, label %different same: %cmp2 = icmp sle i32 %x, %y -; CHECK: ret i1 false ret i1 %cmp2 different: %cmp3 = icmp sgt i32 %x, %y 
-; CHECK: ret i1 false ret i1 %cmp3 } -; CHECK-LABEL: @test7_fp( define i1 @test7_fp(float %x, float %y) { +; CHECK-LABEL: @test7_fp( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: ret i1 false +; CHECK: different: +; CHECK-NEXT: ret i1 false +; %cmp = fcmp ogt float %x, %y br i1 %cmp, label %same, label %different same: %cmp2 = fcmp ule float %x, %y -; CHECK: ret i1 false ret i1 %cmp2 different: %cmp3 = fcmp ogt float %x, %y -; CHECK: ret i1 false - ret i1 %cmp3 -} - -; CHECK-LABEL: @test8( -define i1 @test8(i32 %x, i32 %y) { - %cmp2 = icmp sle i32 %x, %y - %cmp = icmp sgt i32 %x, %y - %cmp3 = icmp sgt i32 %x, %y - br i1 %cmp, label %same, label %different - -same: -; CHECK: ret i1 false - ret i1 %cmp2 - -different: -; CHECK: ret i1 false - ret i1 %cmp3 -} - -; CHECK-LABEL: @test8_fp( -define i1 @test8_fp(float %x, float %y) { - %cmp2 = fcmp ule float %x, %y - %cmp = fcmp ogt float %x, %y - %cmp3 = fcmp ogt float %x, %y - br i1 %cmp, label %same, label %different - -same: -; CHECK: ret i1 false - ret i1 %cmp2 - -different: -; CHECK: ret i1 false ret i1 %cmp3 } ; PR1768 -; CHECK-LABEL: @test9( define i32 @test9(i32 %i, i32 %j) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK: cond_true: +; CHECK-NEXT: ret i32 0 +; CHECK: ret: +; CHECK-NEXT: ret i32 5 +; %cmp = icmp eq i32 %i, %j br i1 %cmp, label %cond_true, label %ret cond_true: %diff = sub i32 %i, %j ret i32 %diff -; CHECK: ret i32 0 ret: ret i32 5 -; CHECK: ret i32 5 } ; PR1768 -; CHECK-LABEL: @test10( define i32 @test10(i32 %j, i32 %i) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK: cond_true: +; CHECK-NEXT: ret i32 0 +; CHECK: ret: +; CHECK-NEXT: ret i32 5 +; %cmp = icmp eq i32 %i, %j br i1 %cmp, label %cond_true, label %ret cond_true: %diff = sub i32 %i, %j ret i32 %diff -; CHECK: ret i32 0 ret: ret i32 5 -; CHECK: ret i32 5 } declare i32 @yogibar() -; CHECK-LABEL: @test11( define i32 @test11(i32 %x) { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[V0:%.*]] = call i32 @yogibar() +; CHECK-NEXT: [[V1:%.*]] = call i32 @yogibar() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]] +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]] +; CHECK: cond_true: +; CHECK-NEXT: ret i32 [[V0]] +; CHECK: next: +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0]] +; CHECK-NEXT: br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]] +; CHECK: cond_true2: +; CHECK-NEXT: ret i32 [[X]] +; CHECK: next2: +; CHECK-NEXT: ret i32 0 +; %v0 = call i32 @yogibar() %v1 = call i32 @yogibar() %cmp = icmp eq i32 %v0, %v1 @@ -268,7 +213,6 @@ define i32 @test11(i32 %x) { cond_true: ret i32 %v1 -; CHECK: ret i32 %v0 next: %cmp2 = icmp eq i32 %x, %v0 @@ -276,14 +220,23 @@ next: cond_true2: ret i32 %v0 -; CHECK: ret i32 %x next2: ret i32 0 } -; CHECK-LABEL: @test12( define i32 @test12(i32 %x) { +; CHECK-LABEL: @test12( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK: cond_true: +; CHECK-NEXT: br label [[RET:%.*]] +; CHECK: cond_false: +; CHECK-NEXT: br label [[RET]] +; CHECK: ret: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, [[COND_TRUE]] ], [ [[X]], [[COND_FALSE]] ] +; CHECK-NEXT: ret 
i32 [[RES]] +; %cmp = icmp eq i32 %x, 0 br i1 %cmp, label %cond_true, label %cond_false @@ -295,6 +248,5 @@ cond_false: ret: %res = phi i32 [ %x, %cond_true ], [ %x, %cond_false ] -; CHECK: %res = phi i32 [ 0, %cond_true ], [ %x, %cond_false ] ret i32 %res } diff --git a/test/Transforms/NewGVN/deadstore.ll b/test/Transforms/NewGVN/deadstore.ll new file mode 100644 index 000000000000..778f42b77581 --- /dev/null +++ b/test/Transforms/NewGVN/deadstore.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s + +;; Most of these are borrowed from transforms/DSE/simple.ll +;; NewGVN should be able to eliminate any stores of the same value that are actually redundant. + +;; tmp5 is a store of the same value to the same location as the load. +define void @test12({ i32, i32 }* %x) nounwind { +; CHECK-LABEL: @test12( +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[X:%.*]], i32 0, i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[X]], i32 0, i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = sub i32 0, [[TMP8]] +; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP7]], align 4 +; CHECK-NEXT: ret void +; + %tmp4 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 0 + %tmp5 = load i32, i32* %tmp4, align 4 + %tmp7 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 1 + %tmp8 = load i32, i32* %tmp7, align 4 + %tmp17 = sub i32 0, %tmp8 + store i32 %tmp5, i32* %tmp4, align 4 + store i32 %tmp17, i32* %tmp7, align 4 + ret void +} +; Remove redundant store if loaded value is in another block. +define i32 @test26(i1 %c, i32* %p) { +; CHECK-LABEL: @test26( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 0 +; +entry: + %v = load i32, i32* %p, align 4 + br i1 %c, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + store i32 %v, i32* %p, align 4 + br label %bb3 +bb3: + ret i32 0 +} + +declare void @unknown_func() +; Remove redundant store, which is in the same loop as the load. +define i32 @test33(i1 %c, i32* %p, i32 %i) { +; CHECK-LABEL: @test33( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @unknown_func() +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB3:%.*]] +; CHECK: bb3: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %bb1 +bb1: + %v = load i32, i32* %p, align 4 + br label %bb2 +bb2: + store i32 %v, i32* %p, align 4 + ; Might read and overwrite value at %p, but doesn't matter.
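; (Editor's note, a sketch of why removing the store is still sound, in
; MemorySSA-style terms:
;   %v = load i32, i32* %p   ; reads memory state S
;   store i32 %v, i32* %p    ; defines a state byte-identical to S
; so whether or not @unknown_func reads %p afterwards, it observes the same
; bytes with or without the store, and NewGVN may drop it.)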
+ call void @unknown_func() + br i1 undef, label %bb1, label %bb3 +bb3: + ret i32 0 +} diff --git a/test/Transforms/NewGVN/debugloc.ll b/test/Transforms/NewGVN/debugloc.ll new file mode 100644 index 000000000000..55597a078bbc --- /dev/null +++ b/test/Transforms/NewGVN/debugloc.ll @@ -0,0 +1,78 @@ +; XFAIL: * +; RUN: opt < %s -newgvn -S | FileCheck %s +; CHECK: {{^}}for.body: +; CHECK-NEXT: [[VREG1:%[^ ]+]] = phi{{.*}}[[VREG2:%[^ ]+]],{{.*}}%.sink, +; CHECK-NOT: !dbg +; CHECK-SAME: {{$}} +; CHECK: {{^}}for.inc: +; CHECK-NEXT: [[VREG2]] = phi{{.*}}%inc,{{.*}}[[VREG1]] + +target triple = "x86_64-unknown-linux-gnu" + +@g = external local_unnamed_addr global i32, align 4 + +; Function Attrs: nounwind uwtable +define void @foo(i32 %x, i32 %y, i32 %z) local_unnamed_addr #0 !dbg !4 { +entry: + %not.tobool = icmp eq i32 %x, 0, !dbg !8 + %.sink = zext i1 %not.tobool to i32, !dbg !8 + store i32 %.sink, i32* @g, align 4, !tbaa !9 + %cmp8 = icmp sgt i32 %y, 0, !dbg !13 + br i1 %cmp8, label %for.body.preheader, label %for.end, !dbg !17 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !19 + +for.body: ; preds = %for.body.preheader, %for.inc + %i.09 = phi i32 [ %inc4, %for.inc ], [ 0, %for.body.preheader ] + %cmp1 = icmp sgt i32 %i.09, %z, !dbg !19 + br i1 %cmp1, label %if.then2, label %for.inc, !dbg !21 + +if.then2: ; preds = %for.body + %0 = load i32, i32* @g, align 4, !dbg !22, !tbaa !9 + %inc = add nsw i32 %0, 1, !dbg !22 + store i32 %inc, i32* @g, align 4, !dbg !22, !tbaa !9 + br label %for.inc, !dbg !23 + +for.inc: ; preds = %for.body, %if.then2 + %inc4 = add nuw nsw i32 %i.09, 1, !dbg !24 + %exitcond = icmp ne i32 %inc4, %y, !dbg !13 + br i1 %exitcond, label %for.body, label %for.end.loopexit, !dbg !17 + +for.end.loopexit: ; preds = %for.inc + br label %for.end, !dbg !26 + +for.end: ; preds = %for.end.loopexit, %entry + ret void, !dbg !26 +} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1) +!1 = !DIFile(filename: "foo.c", directory: "b/") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!5 = !DISubroutineType(types: !6) +!6 = !{null, !7, !7, !7} +!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!8 = !DILocation(line: 4, column: 7, scope: !4) +!9 = !{!10, !10, i64 0} +!10 = !{!"int", !11, i64 0} +!11 = !{!"omnipotent char", !12, i64 0} +!12 = !{!"Simple C/C++ TBAA"} +!13 = !DILocation(line: 10, column: 13, scope: !14) +!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 1) +!15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 10, column: 3) +!16 = distinct !DILexicalBlock(scope: !4, file: !1, line: 10, column: 3) +!17 = !DILocation(line: 10, column: 3, scope: !18) +!18 = !DILexicalBlockFile(scope: !16, file: !1, discriminator: 1) +!19 = !DILocation(line: 11, column: 11, scope: !20) +!20 = distinct !DILexicalBlock(scope: !15, file: !1, line: 11, column: 9) +!21 = !DILocation(line: 11, column: 9, scope: !15) +!22 = !DILocation(line: 12, column: 8, scope: !20) +!23 = !DILocation(line: 12, column: 7, scope: !20) +!24 = !DILocation(line: 10, column: 20, scope: !25) +!25 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2) +!26 = !DILocation(line: 13, column: 1, scope: !4) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} diff --git a/test/Transforms/NewGVN/edge.ll b/test/Transforms/NewGVN/edge.ll index 
2d453bda5a4a..a8afc140e218 100644 --- a/test/Transforms/NewGVN/edge.ll +++ b/test/Transforms/NewGVN/edge.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: opt -newgvn -S < %s | FileCheck %s define i32 @f1(i32 %x) { diff --git a/test/Transforms/NewGVN/fence.ll b/test/Transforms/NewGVN/fence.ll index ac4270d9aceb..190fd7344922 100644 --- a/test/Transforms/NewGVN/fence.ll +++ b/test/Transforms/NewGVN/fence.ll @@ -1,6 +1,7 @@ ; XFAIL: * ; RUN: opt -S -basicaa -newgvn < %s | FileCheck %s +@a = external constant i32 ; We can value forward across the fence since we can (semantically) ; reorder the following load before the fence. define i32 @test(i32* %addr.i) { @@ -53,6 +54,25 @@ define i32 @test3(i32* noalias %addr.i, i32* noalias %otheraddr) { ret i32 %res } +; We can forward the load across both fences, +; because the load is from a constant +; memory location. +define i32 @test4(i32* %addr) { +; CHECK-LABEL: @test4 +; CHECK-NOT: load +; CHECK: fence release +; CHECK: store +; CHECK: fence seq_cst +; CHECK: ret i32 0 + %var = load i32, i32* @a + fence release + store i32 42, i32* %addr, align 8 + fence seq_cst + %var2 = load i32, i32* @a + %var3 = sub i32 %var, %var2 + ret i32 %var3 +} + ; Another example of why forwarding across an acquire fence is problematic ; can be seen in a normal locking operation. Say we had: ; *p = 5; unlock(l); lock(l); use(p); diff --git a/test/Transforms/NewGVN/flags.ll b/test/Transforms/NewGVN/flags.ll index d03edd6776c9..e849ae2afb64 100644 --- a/test/Transforms/NewGVN/flags.ll +++ b/test/Transforms/NewGVN/flags.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: opt -newgvn -S < %s | FileCheck %s declare void @use(i1) diff --git a/test/Transforms/NewGVN/fold-const-expr.ll b/test/Transforms/NewGVN/fold-const-expr.ll index 20b74277b1ac..acd7c8df2530 100644 --- a/test/Transforms/NewGVN/fold-const-expr.ll +++ b/test/Transforms/NewGVN/fold-const-expr.ll @@ -1,11 +1,10 @@ -; XFAIL: * ; GVN failed to do constant expression folding and expanded ; them unfolded in many places, producing exponentially large const ; expressions. As a result, the compilation never finished.
; This test checks that we are folding constant expressions ; PR 28418 ; RUN: opt -newgvn -S < %s | FileCheck %s - +;; NewGVN fails this due to not having load coercion %2 = type { i32, i32, i32, i32, i32 } define i32 @_Z16vector3util_mainv(i32 %x, i32 %y) { %tmp1 = alloca %2, align 4 diff --git a/test/Transforms/NewGVN/lifetime-simple.ll b/test/Transforms/NewGVN/lifetime-simple.ll index 63e361c49eb9..382c7da2b3fb 100644 --- a/test/Transforms/NewGVN/lifetime-simple.ll +++ b/test/Transforms/NewGVN/lifetime-simple.ll @@ -8,13 +8,13 @@ define i8 @test(i8* %P) nounwind { ; CHECK-NOT: load ; CHECK: lifetime.end entry: - call void @llvm.lifetime.start(i64 32, i8* %P) + call void @llvm.lifetime.start.p0i8(i64 32, i8* %P) %0 = load i8, i8* %P store i8 1, i8* %P - call void @llvm.lifetime.end(i64 32, i8* %P) + call void @llvm.lifetime.end.p0i8(i64 32, i8* %P) %1 = load i8, i8* %P ret i8 %1 } -declare void @llvm.lifetime.start(i64 %S, i8* nocapture %P) readonly -declare void @llvm.lifetime.end(i64 %S, i8* nocapture %P) +declare void @llvm.lifetime.start.p0i8(i64 %S, i8* nocapture %P) readonly +declare void @llvm.lifetime.end.p0i8(i64 %S, i8* nocapture %P) diff --git a/test/Transforms/NewGVN/load-constant-mem.ll b/test/Transforms/NewGVN/load-constant-mem.ll index 215258b934c0..4c1624e09f60 100644 --- a/test/Transforms/NewGVN/load-constant-mem.ll +++ b/test/Transforms/NewGVN/load-constant-mem.ll @@ -1,19 +1,21 @@ -; RUN: opt < %s -basicaa -newgvn -instcombine -S | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s ; PR4189 @G = external constant [4 x i32] define i32 @test(i8* %p, i32 %i) nounwind { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 [[I:%.*]] +; CHECK-NEXT: store i8 4, i8* [[P:%.*]] +; CHECK-NEXT: ret i32 0 +; entry: - %P = getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 %i - %A = load i32, i32* %P - store i8 4, i8* %p - %B = load i32, i32* %P - %C = sub i32 %A, %B - ret i32 %C + %P = getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 %i + %A = load i32, i32* %P + store i8 4, i8* %p + %B = load i32, i32* %P + %C = sub i32 %A, %B + ret i32 %C } -; CHECK: define i32 @test(i8* %p, i32 %i) #0 { -; CHECK-NEXT: entry: -; CHECK-NEXT: store i8 4, i8* %p, align 1 -; CHECK-NEXT: ret i32 0 -; CHECK-NEXT: } diff --git a/test/Transforms/NewGVN/loadforward.ll b/test/Transforms/NewGVN/loadforward.ll new file mode 100644 index 000000000000..d66b5332601f --- /dev/null +++ b/test/Transforms/NewGVN/loadforward.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%rec11 = type { i16, i16, i16 } + +@str = global %rec11 { i16 1, i16 2, i16 3 } + +;; Test that we forward the first store to the second load +define i16 @bazinga() { +; CHECK-LABEL: @bazinga( +; CHECK-NEXT: [[_TMP10:%.*]] = load i16, i16* getelementptr inbounds (%rec11, %rec11* @str, i16 0, i32 1) +; CHECK-NEXT: store i16 [[_TMP10]], i16* getelementptr inbounds (%rec11, %rec11* @str, i16 0, i32 0) +; CHECK-NEXT: [[_TMP15:%.*]] = icmp eq i16 [[_TMP10]], 3 +; CHECK-NEXT: [[_TMP16:%.*]] = select i1 [[_TMP15]], i16 1, i16 0 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: ret i16 [[_TMP16]] +; + %_tmp9 = getelementptr %rec11, %rec11* @str, i16 0, i32 1 + %_tmp10 = load i16, i16* %_tmp9 + %_tmp12 = getelementptr
%rec11, %rec11* @str, i16 0, i32 0 + store i16 %_tmp10, i16* %_tmp12 + %_tmp13 = getelementptr %rec11, %rec11* @str, i16 0, i32 0 + %_tmp14 = load i16, i16* %_tmp13 + %_tmp15 = icmp eq i16 %_tmp14, 3 + %_tmp16 = select i1 %_tmp15, i16 1, i16 0 + br label %bb1 + +bb1: + ret i16 %_tmp16 +} diff --git a/test/Transforms/NewGVN/malloc-load-removal.ll b/test/Transforms/NewGVN/malloc-load-removal.ll index c91b6e17f79d..72f4839a5545 100644 --- a/test/Transforms/NewGVN/malloc-load-removal.ll +++ b/test/Transforms/NewGVN/malloc-load-removal.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: opt -S -basicaa -newgvn < %s | FileCheck %s ; RUN: opt -S -basicaa -newgvn -disable-simplify-libcalls < %s | FileCheck %s -check-prefix=CHECK_NO_LIBCALLS ; PR13694 diff --git a/test/Transforms/NewGVN/phi-edge-handling.ll b/test/Transforms/NewGVN/phi-edge-handling.ll new file mode 100644 index 000000000000..6451006a6949 --- /dev/null +++ b/test/Transforms/NewGVN/phi-edge-handling.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -newgvn -S | FileCheck %s + + +;; Block 6 is reachable, but edge 6->4 is not +;; This means the phi value is undef, not 0 +; Function Attrs: ssp uwtable +define i16 @hoge() local_unnamed_addr #0 align 2 { +; CHECK-LABEL: @hoge( +; CHECK-NEXT: bb: +; CHECK-NEXT: switch i8 undef, label [[BB7:%.*]] [ +; CHECK-NEXT: i8 0, label [[BB1:%.*]] +; CHECK-NEXT: i8 12, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB6:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb3: +; CHECK-NEXT: unreachable +; CHECK: bb4: +; CHECK-NEXT: ret i16 undef +; CHECK: bb6: +; CHECK-NEXT: br i1 true, label [[BB3:%.*]], label [[BB4]], !llvm.loop !1 +; CHECK: bb7: +; CHECK-NEXT: unreachable +; +bb: + switch i8 undef, label %bb7 [ + i8 0, label %bb1 + i8 12, label %bb2 + ] + +bb1: ; preds = %bb + br label %bb6 + +bb2: ; preds = %bb + br label %bb4 + +bb3: ; preds = %bb6 + unreachable + +bb4: ; preds = %bb6, %bb2 + %tmp = phi i16 [ 0, %bb6 ], [ undef, %bb2 ] + ret i16 %tmp + +bb6: ; preds = %bb4 + br i1 true, label %bb3, label %bb4, !llvm.loop !1 + +bb7: ; preds = %bb + unreachable +} + +attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 (http://llvm.org/git/clang.git a8b933d4d1d133594fdaed35ee5814514b738f6d) (/Users/dannyb/sources/llvm-clean fc630a9b5613f544c07a8f16abcc173793df62cf)"} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.unroll.disable"} diff --git a/test/Transforms/NewGVN/pr10820.ll b/test/Transforms/NewGVN/pr10820.ll index d7a02b570aa0..dbb1376874db 100644 --- a/test/Transforms/NewGVN/pr10820.ll +++ b/test/Transforms/NewGVN/pr10820.ll @@ -1,6 +1,6 @@ ; XFAIL: * ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s - +; NewGVN fails this due to missing load coercion target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" diff --git 
a/test/Transforms/NewGVN/pr14166.ll b/test/Transforms/NewGVN/pr14166.ll index daf27cdb7fd7..c526c50bc75d 100644 --- a/test/Transforms/NewGVN/pr14166.ll +++ b/test/Transforms/NewGVN/pr14166.ll @@ -1,5 +1,6 @@ ; XFAIL: * ; RUN: opt -disable-basicaa -newgvn -S < %s | FileCheck %s +; NewGVN fails this due to missing load coercion target datalayout = "e-p:32:32:32" target triple = "i386-pc-linux-gnu" define <2 x i32> @test1() { diff --git a/test/Transforms/NewGVN/pr17732.ll b/test/Transforms/NewGVN/pr17732.ll index 4a194e6a08b5..6aee6ebeb065 100644 --- a/test/Transforms/NewGVN/pr17732.ll +++ b/test/Transforms/NewGVN/pr17732.ll @@ -1,6 +1,4 @@ -; XFAIL: * ; RUN: opt -newgvn -S -o - < %s | FileCheck %s - target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Transforms/NewGVN/pr31594.ll b/test/Transforms/NewGVN/pr31594.ll index 0cdac1a7fff4..8ef8aa66df1f 100644 --- a/test/Transforms/NewGVN/pr31594.ll +++ b/test/Transforms/NewGVN/pr31594.ll @@ -3,7 +3,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -define void @patatino(i8* %blah, i32 %choice) { +define i1 @patatino(i8* %blah, i32 %choice) { ; CHECK-LABEL: @patatino( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[WHILE_COND:%.*]] @@ -19,8 +19,10 @@ define void @patatino(i8* %blah, i32 %choice) { ; CHECK-NEXT: br label [[WHILE_COND]] ; CHECK: while.end: ; CHECK-NEXT: store i8 0, i8* [[FOO]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[BLAH]], align 1 +; CHECK-NEXT: [[LOADED:%.*]] = icmp eq i8 [[TMP0]], 0 ; CHECK-NEXT: store i8 0, i8* [[BLAH]], align 1 -; CHECK-NEXT: ret void +; CHECK-NEXT: ret i1 [[LOADED]] ; entry: br label %while.cond @@ -48,7 +50,7 @@ while.end: %0 = load i8, i8* %blah, align 1 %loaded = icmp eq i8 %0, 0 store i8 0, i8* %blah, align 1 - ret void + ret i1 %loaded } @@ -75,6 +77,7 @@ define void @foo(i8* %arg) { ; CHECK-NEXT: i8 6, label [[BB8:%.*]] ; CHECK-NEXT: ] ; CHECK: bb8: +; CHECK-NEXT: store i8 undef, i8* null ; CHECK-NEXT: br label [[BB4]] ; CHECK: bb9: ; CHECK-NEXT: store i8 0, i8* [[ARG]], !g !0 diff --git a/test/Transforms/NewGVN/pr31613.ll b/test/Transforms/NewGVN/pr31613.ll index d3a41830c789..d96ea18466ad 100644 --- a/test/Transforms/NewGVN/pr31613.ll +++ b/test/Transforms/NewGVN/pr31613.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s +; RUN: opt < %s -basicaa -newgvn -enable-store-refinement -S | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ;; Both of these tests are tests of phi nodes that end up all equivalent to each other @@ -78,21 +78,18 @@ define void @e() { ; CHECK-NEXT: br label [[H:%.*]] ; CHECK: h: ; CHECK-NEXT: call void @c.d.p(i64 8, i8* undef) -; CHECK-NEXT: [[I:%.*]] = load i32, i32* [[F]] ; CHECK-NEXT: [[J:%.*]] = load i32, i32* null -; CHECK-NEXT: [[K:%.*]] = icmp eq i32 [[I]], [[J]] -; CHECK-NEXT: br i1 [[K]], label [[L:%.*]], label [[Q:%.*]] +; CHECK-NEXT: br i1 true, label [[L:%.*]], label [[Q:%.*]] ; CHECK: l: ; CHECK-NEXT: br label [[R:%.*]] ; CHECK: q: -; CHECK-NEXT: [[M:%.*]] = load %struct.a*, %struct.a** null +; CHECK-NEXT: store i8 undef, i8* null ; CHECK-NEXT: br label [[R]] ; CHECK: r: ; CHECK-NEXT: switch i32 undef, label [[N:%.*]] [ ; CHECK-NEXT: i32 0, label [[S:%.*]] ; CHECK-NEXT: ] ; CHECK: s: -; CHECK-NEXT: store i32 undef, i32* [[F]], !g !0 ; CHECK-NEXT: br 
label [[H]] ; CHECK: n: ; CHECK-NEXT: [[O:%.*]] = load %struct.a*, %struct.a** null diff --git a/test/Transforms/NewGVN/pr31682.ll b/test/Transforms/NewGVN/pr31682.ll index 108e1e19afbd..96103fad15c2 100644 --- a/test/Transforms/NewGVN/pr31682.ll +++ b/test/Transforms/NewGVN/pr31682.ll @@ -12,7 +12,6 @@ define void @bar() { ; CHECK-NEXT: [[TMP:%.*]] = load %struct.foo*, %struct.foo** @global ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_FOO:%.*]], %struct.foo* [[TMP]], i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[BB2]], label [[BB7:%.*]] ; CHECK: bb7: ; CHECK-NEXT: br label [[BB10:%.*]] diff --git a/test/Transforms/NewGVN/pr31758.ll b/test/Transforms/NewGVN/pr31758.ll new file mode 100644 index 000000000000..6052ca973aff --- /dev/null +++ b/test/Transforms/NewGVN/pr31758.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -newgvn %s -S -o - | FileCheck %s + +%struct.dipsy = type {} +%struct.fluttershy = type { %struct.dipsy* } +%struct.patatino = type {} + +define void @tinkywinky() { +; CHECK-LABEL: @tinkywinky( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB90:%.*]] +; CHECK: bb90: +; CHECK-NEXT: br label [[BB90]] +; CHECK: bb138: +; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: br label [[BB138:%.*]] +; +bb: + br label %bb90 + +bb90: + %tmp = getelementptr inbounds %struct.fluttershy, %struct.fluttershy* undef, i64 0, i32 0 + %tmp91 = bitcast %struct.dipsy** %tmp to %struct.patatino** + %tmp92 = load %struct.patatino*, %struct.patatino** %tmp91, align 8 + %tmp99 = getelementptr inbounds %struct.patatino, %struct.patatino* %tmp92 + %tmp134 = getelementptr inbounds %struct.fluttershy, %struct.fluttershy* undef, i64 0, i32 0 + %tmp135 = bitcast %struct.dipsy** %tmp134 to %struct.patatino** + %tmp136 = load %struct.patatino*, %struct.patatino** %tmp135, align 8 + br label %bb90 + +bb138: + %tmp139 = getelementptr inbounds %struct.patatino, %struct.patatino* %tmp136 + br label %bb138 +} diff --git a/test/Transforms/NewGVN/pr32403.ll b/test/Transforms/NewGVN/pr32403.ll new file mode 100644 index 000000000000..2552e0e66ab9 --- /dev/null +++ b/test/Transforms/NewGVN/pr32403.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +;RUN: opt -newgvn -S < %s | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define void @reorder_ref_pic_list() local_unnamed_addr { +; CHECK-LABEL: @reorder_ref_pic_list( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[REFIDXLX_0:%.*]] = phi i32 [ [[INC_I51:%.*]], [[IF_ELSE58:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 undef, label [[IF_THEN13:%.*]], label [[IF_ELSE58]] +; CHECK: if.then13: +; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[REFIDXLX_0]], 1 +; CHECK-NEXT: br label [[FOR_BODY8_I:%.*]] +; CHECK: for.body8.i: +; CHECK-NEXT: br i1 undef, label [[FOR_INC24_I:%.*]], label [[IF_THEN17_I:%.*]] +; CHECK: if.then17.i: +; CHECK-NEXT: br label [[FOR_INC24_I]] +; CHECK: for.inc24.i: +; CHECK-NEXT: br label [[FOR_BODY8_I]] +; CHECK: if.else58: +; CHECK-NEXT: [[INC_I51]] = add nsw i32 [[REFIDXLX_0]], 1 +; CHECK-NEXT: br label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br i1 undef, label %for.end, 
label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %if.else58, %for.body.preheader + %refIdxLX.0 = phi i32 [ %inc.i51, %if.else58 ], [ 0, %for.body.preheader ] + br i1 undef, label %if.then13, label %if.else58 + +if.then13: ; preds = %for.body + %inc.i = add nsw i32 %refIdxLX.0, 1 + br label %for.body8.i + +for.body8.i: ; preds = %for.inc24.i, %if.then13 + %nIdx.052.i = phi i32 [ %inc.i, %if.then13 ], [ %nIdx.1.i, %for.inc24.i ] + br i1 undef, label %for.inc24.i, label %if.then17.i + +if.then17.i: ; preds = %for.body8.i + br label %for.inc24.i + +for.inc24.i: ; preds = %if.then17.i, %for.body8.i + %nIdx.1.i = phi i32 [ undef, %if.then17.i ], [ %nIdx.052.i, %for.body8.i ] + br label %for.body8.i + +if.else58: ; preds = %for.body + %inc.i51 = add nsw i32 %refIdxLX.0, 1 + br label %for.body + +for.end: ; preds = %entry + ret void +} + + + diff --git a/test/Transforms/NewGVN/pr32607.ll b/test/Transforms/NewGVN/pr32607.ll new file mode 100644 index 000000000000..203ac75e2d07 --- /dev/null +++ b/test/Transforms/NewGVN/pr32607.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -newgvn %s -S -o - | FileCheck %s +define hidden void @foo() { +; CHECK-LABEL: @foo( +; CHECK-NEXT: top: +; CHECK-NEXT: br label [[IF:%.*]] +; CHECK: if: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[TMP1:%.*]], [[IF]] ], [ undef, [[TOP:%.*]] ] +; CHECK-NEXT: [[TMP1]] = fadd double [[TMP0]], undef +; CHECK-NEXT: br i1 false, label [[L50:%.*]], label [[IF]] +; CHECK: L50: +; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: ret void +; +top: + %.promoted = load double, double* undef, align 8 + br label %if + +;; This is really a multi-valued phi, because the phi is defined by an expression of the phi. +;; This means that we can't propagate the value over the backedge, because we'll just cycle +;; through every value. 
+ +if: ; preds = %if, %top + %0 = phi double [ %1, %if ], [ %.promoted, %top ] + %1 = fadd double %0, undef + br i1 false, label %L50, label %if + +L50: ; preds = %if + %.lcssa = phi double [ %1, %if ] + store double %.lcssa, double* undef, align 8 + ret void +} + diff --git a/test/Transforms/NewGVN/predicates.ll b/test/Transforms/NewGVN/predicates.ll new file mode 100644 index 000000000000..61b35c5e5c67 --- /dev/null +++ b/test/Transforms/NewGVN/predicates.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -basicaa -newgvn -S < %s | FileCheck %s + +; Function Attrs: noinline norecurse nounwind readonly ssp uwtable +define i32 @mp_unsgn_cmp(i32 %n, i32* nocapture readonly %in1, i32* nocapture readonly %in2) local_unnamed_addr { +; CHECK-LABEL: @mp_unsgn_cmp( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], -1 +; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_INC_PREHEADER:%.*]], label [[IF_ELSE:%.*]] +; CHECK: for.inc.preheader: +; CHECK-NEXT: br label [[FOR_INC:%.*]] +; CHECK: for.inc: +; CHECK-NEXT: [[STOREMERGE2:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC]] ], [ 0, [[FOR_INC_PREHEADER]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STOREMERGE2]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[IN1:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[IN2:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[INC]] = add nsw i32 [[STOREMERGE2]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[STOREMERGE2]], [[N]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[SUB]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP2]], [[CMP1]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_INC]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[SUB]], 0 +; CHECK-NEXT: br i1 [[CMP5]], label [[IF_END8:%.*]], label [[IF_ELSE]] +; CHECK: if.else: +; CHECK-NEXT: [[SUB1_LCSSA4:%.*]] = phi i32 [ [[SUB]], [[FOR_END]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP6:%.*]] = icmp slt i32 [[SUB1_LCSSA4]], 0 +; CHECK-NEXT: [[DOTSUB1_LCSSA:%.*]] = select i1 [[CMP6]], i32 -1, i32 [[SUB1_LCSSA4]] +; CHECK-NEXT: ret i32 [[DOTSUB1_LCSSA]] +; CHECK: if.end8: +; CHECK-NEXT: ret i32 1 +; +entry: + %cmp11 = icmp sgt i32 %n, -1 + br i1 %cmp11, label %for.inc.preheader, label %if.else + +for.inc.preheader: ; preds = %entry + br label %for.inc + +for.inc: ; preds = %for.inc.preheader, %for.inc + %storemerge2 = phi i32 [ %inc, %for.inc ], [ 0, %for.inc.preheader ] + %idxprom = sext i32 %storemerge2 to i64 + %arrayidx = getelementptr inbounds i32, i32* %in1, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx4 = getelementptr inbounds i32, i32* %in2, i64 %idxprom + %1 = load i32, i32* %arrayidx4, align 4 + %sub = sub nsw i32 %0, %1 + %inc = add nsw i32 %storemerge2, 1 + %cmp1 = icmp slt i32 %storemerge2, %n + %cmp2 = icmp eq i32 %sub, 0 + %or.cond = and i1 %cmp2, %cmp1 +;; This is a self-critical edge to for.inc. If we insert predicate info on it, we will insert +;; predicateinfo at the end of this block, and think it dominates everything using only dfs +;; numbers, instead of proper edge dominance. We would then proceed to propagate the true value +;; of sub == 0 everywhere, making this function only ever return 0.
+ br i1 %or.cond, label %for.inc, label %for.end + +for.end: ; preds = %for.inc + %sub.lcssa = phi i32 [ %sub, %for.inc ] + %cmp5 = icmp sgt i32 %sub.lcssa, 0 + br i1 %cmp5, label %if.end8, label %if.else + +if.else: ; preds = %entry, %for.end + %sub1.lcssa4 = phi i32 [ %sub.lcssa, %for.end ], [ 0, %entry ] + %cmp6 = icmp slt i32 %sub1.lcssa4, 0 + %.sub1.lcssa = select i1 %cmp6, i32 -1, i32 %sub1.lcssa4 + ret i32 %.sub1.lcssa + +if.end8: ; preds = %for.end + ret i32 1 +} + + +;; This test will generate a copy of a copy of predicateinfo to the multiple uses +;; of branch conditions below. Make sure we don't try to extract operand info. +; Function Attrs: uwtable +define fastcc void @barney() { +; CHECK-LABEL: @barney( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB22:%.*]] +; CHECK: bb22: +; CHECK-NEXT: br i1 undef, label [[BB29:%.*]], label [[BB35:%.*]] +; CHECK: bb29: +; CHECK-NEXT: br i1 true, label [[BB33:%.*]], label [[BB35]] +; CHECK: bb33: +; CHECK-NEXT: br i1 true, label [[BB35]], label [[BB35]] +; CHECK: bb35: +; CHECK-NEXT: unreachable +; +bb: + br label %bb22 +bb22: ; preds = %bb21 + %tmp23 = icmp eq i32 undef, 2 + br i1 %tmp23, label %bb29, label %bb35 + + +bb29: ; preds = %bb28 + br i1 %tmp23, label %bb33, label %bb35 + + +bb33: ; preds = %bb31 + br i1 %tmp23, label %bb35, label %bb35 + + +bb35: ; preds = %bb33, %bb29, %bb22 + unreachable +} + diff --git a/test/Transforms/NewGVN/propagate-ir-flags.ll b/test/Transforms/NewGVN/propagate-ir-flags.ll index bb2f78d41d4f..f8904e87582b 100644 --- a/test/Transforms/NewGVN/propagate-ir-flags.ll +++ b/test/Transforms/NewGVN/propagate-ir-flags.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: opt < %s -newgvn -S | FileCheck %s ; CHECK-LABEL: func_fast diff --git a/test/Transforms/NewGVN/readattrs.ll b/test/Transforms/NewGVN/readattrs.ll index be5fbf5a806f..29ddb97ca1bb 100644 --- a/test/Transforms/NewGVN/readattrs.ll +++ b/test/Transforms/NewGVN/readattrs.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: opt -newgvn -S -o - < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/test/Transforms/NewGVN/refine-stores.ll b/test/Transforms/NewGVN/refine-stores.ll new file mode 100644 index 000000000000..a48f2fe7fdb6 --- /dev/null +++ b/test/Transforms/NewGVN/refine-stores.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s +;; Now that we do store refinement, we have to verify that we add fake uses +;; when we skip existing stores. +;; We are also testing that the various variations that cause stores to move +;; between classes produce the right class movement. +;; All of these tests result in verification failures if they do not.
+%struct.eggs = type {} + +define void @spam(i32 *%a) { +; CHECK-LABEL: @spam( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[FOO:%.*]] = bitcast i32* [[A:%.*]] to %struct.eggs** +; CHECK-NEXT: store %struct.eggs* null, %struct.eggs** [[FOO]] +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @baz() +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb3: +; CHECK-NEXT: store i32 0, i32* undef +; CHECK-NEXT: store %struct.eggs* null, %struct.eggs** [[FOO]] +; CHECK-NEXT: unreachable +; +bb: + %foo = bitcast i32 *%a to %struct.eggs** + store %struct.eggs* null, %struct.eggs** %foo + br label %bb1 + +bb1: ; preds = %bb2, %bb + br i1 undef, label %bb3, label %bb2 + +bb2: ; preds = %bb1 + call void @baz() + br label %bb1 + +bb3: ; preds = %bb1 + store i32 0, i32* undef +;; This store is defined by a memoryphi of the call and the first store. +;; At first, we will prove it equivalent to the first store above. +;; Then the call will become reachable, and the equivalence will be removed. +;; Without it being a use of the first store, we will not update the store +;; to reflect this. + store %struct.eggs* null, %struct.eggs** %foo + unreachable +} + +declare void @baz() + + +define void @a() { +; CHECK-LABEL: @a( +; CHECK-NEXT: b: +; CHECK-NEXT: br label [[C:%.*]] +; CHECK: c: +; CHECK-NEXT: store i64 undef, i64* null +; CHECK-NEXT: br label [[E:%.*]] +; CHECK: e: +; CHECK-NEXT: [[G:%.*]] = load i64*, i64** null +; CHECK-NEXT: store i64* undef, i64** null +; CHECK-NEXT: br i1 undef, label [[C]], label [[E]] +; +b: + br label %c + +c: ; preds = %e, %b + %d = phi i64* [ undef, %b ], [ null, %e ] + store i64 undef, i64* %d + br label %e + +e: ; preds = %e, %c +;; The memory for this load starts out equivalent to just the store in c; we later discover the store after us and +;; need to make sure the right set of values gets marked as changed after the memory leaders change + %g = load i64*, i64** null + %0 = bitcast i64* %g to i64* + store i64* undef, i64** null + br i1 undef, label %c, label %e +} + +; ModuleID = 'bugpoint-reduced-simplified.bc' +source_filename = "bugpoint-output-daef094.bc" +target triple = "x86_64-apple-darwin16.5.0" + +%struct.hoge = type {} + +define void @widget(%struct.hoge* %arg) { +; CHECK-LABEL: @widget( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP:%.*]] = phi %struct.hoge* [ [[ARG:%.*]], [[BB:%.*]] ], [ null, [[BB1]] ] +; CHECK-NEXT: store %struct.hoge* [[TMP]], %struct.hoge** undef +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ [[TMP8:%.*]], [[BB7:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[BB7]], label [[BB5:%.*]] +; CHECK: bb5: +; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* null +; CHECK-NEXT: call void @quux() +; CHECK-NEXT: store i64 [[TMP6]], i64* undef +; CHECK-NEXT: br label [[BB7]] +; CHECK: bb7: +; CHECK-NEXT: [[TMP8]] = add i64 [[TMP3]], 1 +; CHECK-NEXT: br label [[BB2]] +; +bb: + br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp = phi %struct.hoge* [ %arg, %bb ], [ null, %bb1 ] + store %struct.hoge* %tmp, %struct.hoge** undef + br i1 undef, label %bb1, label %bb2 + +bb2: ; preds = %bb7, %bb1 + %tmp3 = phi i64 [ %tmp8, %bb7 ], [ 0, %bb1 ] + %tmp4 = icmp eq i64 %tmp3, 0 + br i1 %tmp4, label %bb7, label %bb5 + +bb5: ; preds = %bb2 + ;; Originally thought equal to the store that comes after it until the phi
edges + ;; are completely traversed + %tmp6 = load i64, i64* null + call void @quux() + store i64 %tmp6, i64* undef + br label %bb7 + +bb7: ; preds = %bb5, %bb2 + %tmp8 = add i64 %tmp3, 1 + br label %bb2 +} + +declare void @quux() +; ModuleID = 'short.ll' +source_filename = "short.ll" + +%struct.a = type {} + +define void @b() { +; CHECK-LABEL: @b( +; CHECK-NEXT: [[C:%.*]] = alloca [[STRUCT_A:%.*]] +; CHECK-NEXT: br label [[D:%.*]] +; CHECK: m: +; CHECK-NEXT: unreachable +; CHECK: d: +; CHECK-NEXT: [[G:%.*]] = bitcast %struct.a* [[C]] to i8* +; CHECK-NEXT: [[F:%.*]] = bitcast i8* [[G]] to i32* +; CHECK-NEXT: [[E:%.*]] = load i32, i32* [[F]] +; CHECK-NEXT: br i1 undef, label [[I:%.*]], label [[J:%.*]] +; CHECK: i: +; CHECK-NEXT: br i1 undef, label [[K:%.*]], label [[M:%.*]] +; CHECK: k: +; CHECK-NEXT: br label [[L:%.*]] +; CHECK: l: +; CHECK-NEXT: unreachable +; CHECK: j: +; CHECK-NEXT: br label [[M]] +; + %c = alloca %struct.a + br label %d + +m: ; preds = %j, %i + store i32 %e, i32* %f + unreachable + +d: ; preds = %0 + %g = bitcast %struct.a* %c to i8* + %h = getelementptr i8, i8* %g + %f = bitcast i8* %h to i32* + %e = load i32, i32* %f + br i1 undef, label %i, label %j + +i: ; preds = %d + br i1 undef, label %k, label %m + +k: ; preds = %i + br label %l + +l: ; preds = %k + %n = phi i32 [ %e, %k ] + ;; Becomes equal and then not equal to the other store, and + ;; along the way, the load. + store i32 %n, i32* %f + unreachable + +j: ; preds = %d + br label %m +} diff --git a/test/Transforms/NewGVN/rle-nonlocal.ll b/test/Transforms/NewGVN/rle-nonlocal.ll index 89f5a6affdec..d318cd5240d8 100644 --- a/test/Transforms/NewGVN/rle-nonlocal.ll +++ b/test/Transforms/NewGVN/rle-nonlocal.ll @@ -1,23 +1,37 @@ -; XFAIL: * +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s define i32 @main(i32** %p, i32 %x, i32 %y) { +; CHECK-LABEL: @main( +; CHECK-NEXT: block1: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[BLOCK2:%.*]], label [[BLOCK3:%.*]] +; CHECK: block2: +; CHECK-NEXT: [[A:%.*]] = load i32*, i32** [[P:%.*]] +; CHECK-NEXT: br label [[BLOCK4:%.*]] +; CHECK: block3: +; CHECK-NEXT: [[B:%.*]] = load i32*, i32** [[P]] +; CHECK-NEXT: br label [[BLOCK4]] +; CHECK: block4: +; CHECK-NEXT: [[EXISTINGPHI:%.*]] = phi i32* [ [[A]], [[BLOCK2]] ], [ [[B]], [[BLOCK3]] ] +; CHECK-NEXT: [[C:%.*]] = load i32, i32* [[EXISTINGPHI]] +; CHECK-NEXT: [[E:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: ret i32 [[E]] +; block1: - %cmp = icmp eq i32 %x, %y - br i1 %cmp , label %block2, label %block3 + %cmp = icmp eq i32 %x, %y + br i1 %cmp , label %block2, label %block3 block2: - %a = load i32*, i32** %p - br label %block4 + %a = load i32*, i32** %p + br label %block4 block3: %b = load i32*, i32** %p br label %block4 block4: -; CHECK-NOT: %existingPHI = phi -; CHECK: %DEAD = phi - %existingPHI = phi i32* [ %a, %block2 ], [ %b, %block3 ] + %existingPHI = phi i32* [ %a, %block2 ], [ %b, %block3 ] %DEAD = load i32*, i32** %p %c = load i32, i32* %DEAD %d = load i32, i32* %existingPHI diff --git a/test/Transforms/NewGVN/rle.ll b/test/Transforms/NewGVN/rle.ll new file mode 100644 index 000000000000..902abe979ea8 --- /dev/null +++ b/test/Transforms/NewGVN/rle.ll @@ -0,0 +1,59 @@ +; RUN: opt < %s -data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -newgvn -S -die | FileCheck %s +; RUN: opt < %s 
-data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -basicaa -newgvn -S -die | FileCheck %s +; memset -> i16 forwarding. +define signext i16 @memset_to_i16_local(i16* %A) nounwind ssp { +entry: + %conv = bitcast i16* %A to i8* + tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i32 1, i1 false) + %arrayidx = getelementptr inbounds i16, i16* %A, i64 42 + %tmp2 = load i16, i16* %arrayidx + ret i16 %tmp2 +; CHECK-LABEL: @memset_to_i16_local( +; CHECK-NOT: load +; CHECK: ret i16 257 +} + +@GCst = constant {i32, float, i32 } { i32 42, float 14., i32 97 } +@GCst_as1 = addrspace(1) constant {i32, float, i32 } { i32 42, float 14., i32 97 } + +; memcpy -> float forwarding. +define float @memcpy_to_float_local(float* %A) nounwind ssp { +entry: + %conv = bitcast float* %A to i8* ; <i8*> [#uses=1] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %conv, i8* bitcast ({i32, float, i32 }* @GCst to i8*), i64 12, i32 1, i1 false) + %arrayidx = getelementptr inbounds float, float* %A, i64 1 ; <float*> [#uses=1] + %tmp2 = load float, float* %arrayidx ; <float> [#uses=1] + ret float %tmp2 +; CHECK-LABEL: @memcpy_to_float_local( +; CHECK-NOT: load +; CHECK: ret float 1.400000e+01 +} +; memcpy from address space 1 +define float @memcpy_to_float_local_as1(float* %A) nounwind ssp { +entry: + %conv = bitcast float* %A to i8* ; <i8*> [#uses=1] + tail call void @llvm.memcpy.p0i8.p1i8.i64(i8* %conv, i8 addrspace(1)* bitcast ({i32, float, i32 } addrspace(1)* @GCst_as1 to i8 addrspace(1)*), i64 12, i32 1, i1 false) + %arrayidx = getelementptr inbounds float, float* %A, i64 1 ; <float*> [#uses=1] + %tmp2 = load float, float* %arrayidx ; <float> [#uses=1] + ret float %tmp2 +; CHECK-LABEL: @memcpy_to_float_local_as1( +; CHECK-NOT: load +; CHECK: ret float 1.400000e+01 +} + +; PR6642 +define i32 @memset_to_load() nounwind readnone { +entry: + %x = alloca [256 x i32], align 4 ; <[256 x i32]*> [#uses=2] + %tmp = bitcast [256 x i32]* %x to i8* ; <i8*> [#uses=1] + call void @llvm.memset.p0i8.i64(i8* %tmp, i8 0, i64 1024, i32 4, i1 false) + %arraydecay = getelementptr inbounds [256 x i32], [256 x i32]* %x, i32 0, i32 0 ; <i32*> + %tmp1 = load i32, i32* %arraydecay ; <i32> [#uses=1] + ret i32 %tmp1 +; CHECK-LABEL: @memset_to_load( +; CHECK: ret i32 0 +} +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind +declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind diff --git a/test/Transforms/NewGVN/storeoverstore.ll b/test/Transforms/NewGVN/storeoverstore.ll index 63f40c511e3c..49b55d430dc7 100644 --- a/test/Transforms/NewGVN/storeoverstore.ll +++ b/test/Transforms/NewGVN/storeoverstore.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -newgvn -S < %s | FileCheck %s ; RUN: opt -passes=newgvn -S -o - %s | FileCheck %s @@ -7,31 +8,35 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ;; stores of the same value do not change the memory state to eliminate them.
define i32 @foo(i32*, i32) { -; CHECK-LABEL: @foo +; CHECK-LABEL: @foo( +; CHECK-NEXT: store i32 5, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP1:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]] +; CHECK: br label [[TMP5]] +; CHECK: [[DOT0:%.*]] = phi i32 [ 10, [[TMP4]] ], [ 5, [[TMP2:%.*]] ] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP6:%.*]], label [[TMP8:%.*]] +; CHECK: [[TMP7:%.*]] = add nsw i32 [[DOT0]], 5 +; CHECK-NEXT: br label [[TMP8]] +; CHECK: [[DOT1:%.*]] = phi i32 [ [[TMP7]], [[TMP6]] ], [ [[DOT0]], [[TMP5]] ] +; CHECK-NEXT: ret i32 [[DOT1]] +; store i32 5, i32* %0, align 4 %3 = icmp ne i32 %1, 0 br i1 %3, label %4, label %7 ; <label>:4: ; preds = %2 -; CHECK-NOT: load %5 = load i32, i32* %0, align 4 -; CHECK-NOT: add %6 = add nsw i32 5, %5 br label %7 ; <label>:7: ; preds = %4, %2 %.0 = phi i32 [ %6, %4 ], [ 5, %2 ] -; CHECK: phi i32 [ 10, %4 ], [ 5, %2 ] store i32 5, i32* %0, align 4 -; CHECK-NOT: icmp %8 = icmp ne i32 %1, 0 -; CHECK: br i1 %3 br i1 %8, label %9, label %12 ; <label>:9: ; preds = %7 -; CHECK-NOT: load %10 = load i32, i32* %0, align 4 -; CHECK: add nsw i32 %.0, 5 %11 = add nsw i32 %.0, %10 br label %12 @@ -43,15 +48,25 @@ define i32 @foo(i32*, i32) { ;; This is similar to the above, but it is a conditional store of the same value ;; which requires value numbering MemoryPhi properly to resolve. define i32 @foo2(i32*, i32) { -; CHECK-LABEL: @foo2 +; CHECK-LABEL: @foo2( +; CHECK-NEXT: store i32 5, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP1:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]] +; CHECK: br label [[TMP6:%.*]] +; CHECK: br label [[TMP6]] +; CHECK: [[DOT0:%.*]] = phi i32 [ 10, [[TMP4]] ], [ 5, [[TMP5]] ] +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; CHECK: [[TMP8:%.*]] = add nsw i32 [[DOT0]], 5 +; CHECK-NEXT: br label [[TMP9]] +; CHECK: [[DOT1:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ [[DOT0]], [[TMP6]] ] +; CHECK-NEXT: ret i32 [[DOT1]] +; store i32 5, i32* %0, align 4 %3 = icmp ne i32 %1, 0 br i1 %3, label %4, label %7 ; <label>:4: ; preds = %2 -; CHECK-NOT: load %5 = load i32, i32* %0, align 4 -; CHECK-NOT: add %6 = add nsw i32 5, %5 br label %8 @@ -60,17 +75,12 @@ define i32 @foo2(i32*, i32) { br label %8 ; <label>:8: ; preds = %7, %4 -; CHECK: phi i32 [ 10, %4 ], [ 5, %5 ] %.0 = phi i32 [ %6, %4 ], [ 5, %7 ] -; CHECK-NOT: icmp %9 = icmp ne i32 %1, 0 -; CHECK: br i1 %3 br i1 %9, label %10, label %13 ; <label>:10: ; preds = %8 -; CHECK-NOT: load %11 = load i32, i32* %0, align 4 -; CHECK: add nsw i32 %.0, 5 %12 = add nsw i32 %.0, %11 br label %13 diff --git a/test/Transforms/NewGVN/tbaa.ll b/test/Transforms/NewGVN/tbaa.ll index 47e20fae7f9c..3dcc4f8acc14 100644 --- a/test/Transforms/NewGVN/tbaa.ll +++ b/test/Transforms/NewGVN/tbaa.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: opt -tbaa -basicaa -newgvn -S < %s | FileCheck %s define i32 @test1(i8* %p, i8* %q) { diff --git a/test/Transforms/NewGVN/volatile-nonvolatile.ll b/test/Transforms/NewGVN/volatile-nonvolatile.ll index 8c74f8b28efb..46d29bad0f4d 100644 --- a/test/Transforms/NewGVN/volatile-nonvolatile.ll +++ b/test/Transforms/NewGVN/volatile-nonvolatile.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: opt -tbaa -newgvn -S < %s | FileCheck %s %struct.t = type { i32* } diff --git a/test/Transforms/ObjCARC/contract-storestrong.ll b/test/Transforms/ObjCARC/contract-storestrong.ll index 2b83bdb9bfbf..a02f7b701912 100644 --- a/test/Transforms/ObjCARC/contract-storestrong.ll +++ 
b/test/Transforms/ObjCARC/contract-storestrong.ll @@ -243,6 +243,19 @@ entry: ret void } +; This used to crash. +; CHECK-LABEL: define i8* @test13( +; CHECK: tail call void @objc_storeStrong(i8** %{{.*}}, i8* %[[NEW:.*]]) +; CHECK-NEXT: ret i8* %[[NEW]] + +define i8* @test13(i8* %a0, i8* %a1, i8** %addr, i8* %new) { + %old = load i8*, i8** %addr, align 8 + call void @objc_release(i8* %old) + %retained = call i8* @objc_retain(i8* %new) + store i8* %retained, i8** %addr, align 8 + ret i8* %retained +} + !0 = !{} ; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext b/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext new file mode 100644 index 000000000000..400b29df3036 --- /dev/null +++ b/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext @@ -0,0 +1,27 @@ +# IR level Instrumentation Flag +:ir +foo +# Func Hash: +53929068288 +# Num Counters: +3 +# Counter Values: +556 +20 +1 +# Num Value Kinds: +1 +# ValueKind = IPVK_MemOPSize: +1 +# NumValueSites: +1 +9 +7:33 +2:88 +9:72 +4:66 +1:99 +5:55 +6:44 +3:77 +8:22 diff --git a/test/Transforms/PGOProfile/Inputs/thinlto_samplepgo_icp.ll b/test/Transforms/PGOProfile/Inputs/thinlto_samplepgo_icp.ll new file mode 100644 index 000000000000..22860f52b5d3 --- /dev/null +++ b/test/Transforms/PGOProfile/Inputs/thinlto_samplepgo_icp.ll @@ -0,0 +1,27 @@ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@fptr = external local_unnamed_addr global void ()*, align 8 + +; Function Attrs: norecurse nounwind uwtable +define void @_Z6updatei(i32 %i) local_unnamed_addr #0 { +entry: + store void ()* @_ZL3foov, void ()** @fptr, align 8 + ret void +} + +; Function Attrs: norecurse nounwind readnone uwtable +define internal void @_ZL3foov() #1 { +entry: + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} +!llvm.ident = !{!31} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 297016)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2) +!1 = !DIFile(filename: "b.cc", directory: "/ssd/llvm/abc/small") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!31 = !{!"clang version 5.0.0 (trunk 297016)"} diff --git a/test/Transforms/PGOProfile/comdat_internal.ll b/test/Transforms/PGOProfile/comdat_internal.ll index 7df6f91fe729..74630179105a 100644 --- a/test/Transforms/PGOProfile/comdat_internal.ll +++ b/test/Transforms/PGOProfile/comdat_internal.ll @@ -12,11 +12,11 @@ $foo = comdat any @bar = global i32 ()* @foo, align 8 ; CHECK: @__llvm_profile_raw_version = constant i64 {{[0-9]+}}, comdat -; CHECK: @__profn__stdin__foo = private constant [11 x i8] c"<stdin>:foo" +; CHECK-NOT: __profn__stdin__foo ; CHECK: @__profc__stdin__foo.[[FOO_HASH]] = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8 -; CHECK: @__profd__stdin__foo.[[FOO_HASH]] = private global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 -5640069336071256030, i64 [[FOO_HASH]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__stdin__foo.[[FOO_HASH]], i32 0, i32 0), i8* null +; CHECK: @__profd__stdin__foo.[[FOO_HASH]] = private global { i64, i64, i64*, i8*, i8*, i32, [2 x i16] } { i64 -5640069336071256030, i64 [[FOO_HASH]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__stdin__foo.[[FOO_HASH]], i32 0, i32 0), i8* null ; CHECK-NOT: bitcast (i32 ()* @foo to i8*) -; CHECK-SAME: 
, i8* null, i32 1, [1 x i16] zeroinitializer }, section "__llvm_prf_data", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8 +; CHECK-SAME: , i8* null, i32 1, [2 x i16] zeroinitializer }, section "__llvm_prf_data", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8 ; CHECK: @__llvm_prf_nm ; CHECK: @llvm.used diff --git a/test/Transforms/PGOProfile/indirect_call_promotion.ll b/test/Transforms/PGOProfile/indirect_call_promotion.ll index c35166505eb9..b892c130152c 100644 --- a/test/Transforms/PGOProfile/indirect_call_promotion.ll +++ b/test/Transforms/PGOProfile/indirect_call_promotion.ll @@ -1,4 +1,6 @@ ; RUN: opt < %s -pgo-icall-prom -S | FileCheck %s --check-prefix=ICALL-PROM +; RUN: opt < %s -pgo-icall-prom -S -icp-samplepgo | FileCheck %s --check-prefix=ICALL-PROM +; RUN: opt < %s -pgo-icall-prom -S -icp-samplepgo | FileCheck %s --check-prefix=ICALL-PROM-SAMPLEPGO ; RUN: opt < %s -passes=pgo-icall-prom -S | FileCheck %s --check-prefix=ICALL-PROM ; RUN: opt < %s -pgo-icall-prom -S -pass-remarks=pgo-icall-prom -icp-count-threshold=0 -icp-percent-threshold=0 -icp-max-prom=4 2>&1 | FileCheck %s --check-prefix=PASS-REMARK ; RUN: opt < %s -passes=pgo-icall-prom -S -pass-remarks=pgo-icall-prom -icp-count-threshold=0 -icp-percent-threshold=0 -icp-max-prom=4 2>&1 | FileCheck %s --check-prefix=PASS-REMARK @@ -40,6 +42,7 @@ entry: ; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]] ; ICALL-PROM: if.true.direct_targ: ; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = call i32 @func4() +; ICALL-PROM-SAMPLEPGO: call i32 @func4(), !prof [[CALL_METADATA:![0-9]+]] ; ICALL-PROM: br label %if.end.icp %call = call i32 %tmp(), !prof !1 ; ICALL-PROM: if.false.orig_indirect: @@ -54,3 +57,4 @@ entry: ; ICALL-PROM: [[BRANCH_WEIGHT]] = !{!"branch_weights", i32 1030, i32 570} ; ICALL-PROM: [[NEW_VP_METADATA]] = !{!"VP", i32 0, i64 570, i64 -4377547752858689819, i64 410} +; ICALL-PROM-SAMPLEPGO: [[CALL_METADATA]] = !{!"branch_weights", i32 1030} diff --git a/test/Transforms/PGOProfile/memcpy.ll b/test/Transforms/PGOProfile/memcpy.ll new file mode 100644 index 000000000000..9db4a4a2dd4c --- /dev/null +++ b/test/Transforms/PGOProfile/memcpy.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -pgo-instr-gen -instrprof -S | FileCheck %s +; RUN: opt <%s -passes=pgo-instr-gen,instrprof -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(i8* %dst, i8* %src, i32* %a, i32 %n) { +entry: + br label %for.cond + +for.cond: + %i.0 = phi i32 [ 0, %entry ], [ %add, %for.cond1 ] + %cmp = icmp slt i32 %i.0, %n + br i1 %cmp, label %for.cond1, label %for.end6 + +for.cond1: + %j.0 = phi i32 [ %inc, %for.body3 ], [ 0, %for.cond ] + %idx.ext = sext i32 %i.0 to i64 + %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext + %0 = load i32, i32* %add.ptr, align 4 + %cmp2 = icmp slt i32 %j.0, %0 + %add = add nsw i32 %i.0, 1 + br i1 %cmp2, label %for.body3, label %for.cond + +for.body3: + %conv = sext i32 %add to i64 +; CHECK: call void @__llvm_profile_instrument_range(i64 %conv, i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [2 x i16] }* @__profd_foo to i8*), i32 0, i64 0, i64 8, i64 8192) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false) + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end6: + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) diff --git 
a/test/Transforms/PGOProfile/memop_size_annotation.ll b/test/Transforms/PGOProfile/memop_size_annotation.ll new file mode 100644 index 000000000000..5481d12b1af1 --- /dev/null +++ b/test/Transforms/PGOProfile/memop_size_annotation.ll @@ -0,0 +1,59 @@ +; RUN: llvm-profdata merge %S/Inputs/memop_size_annotation.proftext -o %t.profdata +; RUN: opt < %s -pgo-instr-use -memop-max-annotations=9 -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION9 +; RUN: opt < %s -passes=pgo-instr-use -memop-max-annotations=9 -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION9 +; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION4 +; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION4 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(i8* %dst, i8* %src, i32* %a, i32 %n) { +entry: + br label %for.cond + +for.cond: + %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ] + %cmp = icmp slt i32 %i.0, %n + br i1 %cmp, label %for.body, label %for.end6 + +for.body: + br label %for.cond1 + +for.cond1: + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %idx.ext = sext i32 %i.0 to i64 + %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext + %0 = load i32, i32* %add.ptr, align 4 + %cmp2 = icmp slt i32 %j.0, %0 + br i1 %cmp2, label %for.body3, label %for.end + +for.body3: + %add = add nsw i32 %i.0, 1 + %conv = sext i32 %add to i64 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false) +; MEMOP_ANNOTATION: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false) +; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE:[0-9]+]] +; MEMOP_ANNOTATION9: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66, i64 5, i64 55, i64 6, i64 44, i64 7, i64 33, i64 8, i64 22} +; MEMOP_ANNOTATION4: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72} + br label %for.inc + +for.inc: + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end: + br label %for.inc4 + +for.inc4: + %inc5 = add nsw i32 %i.0, 1 + br label %for.cond + +for.end6: + ret void +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) + +declare void @llvm.lifetime.end(i64, i8* nocapture) diff --git a/test/Transforms/PGOProfile/memop_size_opt.ll b/test/Transforms/PGOProfile/memop_size_opt.ll new file mode 100644 index 000000000000..c7c42f3c1d33 --- /dev/null +++ b/test/Transforms/PGOProfile/memop_size_opt.ll @@ -0,0 +1,100 @@ +; RUN: opt < %s -passes=pgo-memop-opt -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -S | FileCheck %s --check-prefix=MEMOP_OPT +; RUN: opt < %s -pgo-memop-opt -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -S | FileCheck %s --check-prefix=MEMOP_OPT + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(i8* %dst, i8* %src, i32* %a, i32 %n) !prof !27 { +entry: + br label %for.cond + +for.cond: + %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ] + %cmp = icmp slt i32 %i.0, %n + br i1 %cmp, label %for.body, label 
%for.end6, !prof !28 + +for.body: + br label %for.cond1 + +for.cond1: + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %idx.ext = sext i32 %i.0 to i64 + %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext + %0 = load i32, i32* %add.ptr, align 4 + %cmp2 = icmp slt i32 %j.0, %0 + br i1 %cmp2, label %for.body3, label %for.end, !prof !29 + +for.body3: + %add = add nsw i32 %i.0, 1 + %conv = sext i32 %add to i64 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false), !prof !30 + br label %for.inc + +; MEMOP_OPT: switch i64 %conv, label %[[Default_LABEL:.*]] [ +; MEMOP_OPT: i64 1, label %[[CASE_1_LABEL:.*]] +; MEMOP_OPT: ], !prof [[SWITCH_BW:![0-9]+]] +; MEMOP_OPT: [[CASE_1_LABEL]]: +; MEMOP_OPT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 1, i32 1, i1 false) +; MEMOP_OPT: br label %[[MERGE_LABEL:.*]] +; MEMOP_OPT: [[Default_LABEL]]: +; MEMOP_OPT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false) +; MEMOP_OPT-NOT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false), !prof +; MEMOP_OPT: br label %[[MERGE_LABEL]] +; MEMOP_OPT: [[MERGE_LABEL]]: +; MEMOP_OPT: br label %for.inc +; MEMOP_OPT: [[SWITCH_BW]] = !{!"branch_weights", i32 457, i32 99} + +for.inc: + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end: + br label %for.inc4 + +for.inc4: + %inc5 = add nsw i32 %i.0, 1 + br label %for.cond + +for.end6: + ret void +} + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 579} +!4 = !{!"MaxCount", i64 556} +!5 = !{!"MaxInternalCount", i64 20} +!6 = !{!"MaxFunctionCount", i64 556} +!7 = !{!"NumCounts", i64 6} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13, !14, !15, !16, !16, !17, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26} +!11 = !{i32 10000, i64 556, i32 1} +!12 = !{i32 100000, i64 556, i32 1} +!13 = !{i32 200000, i64 556, i32 1} +!14 = !{i32 300000, i64 556, i32 1} +!15 = !{i32 400000, i64 556, i32 1} +!16 = !{i32 500000, i64 556, i32 1} +!17 = !{i32 600000, i64 556, i32 1} +!18 = !{i32 700000, i64 556, i32 1} +!19 = !{i32 800000, i64 556, i32 1} +!20 = !{i32 900000, i64 556, i32 1} +!21 = !{i32 950000, i64 556, i32 1} +!22 = !{i32 990000, i64 20, i32 2} +!23 = !{i32 999000, i64 1, i32 5} +!24 = !{i32 999900, i64 1, i32 5} +!25 = !{i32 999990, i64 1, i32 5} +!26 = !{i32 999999, i64 1, i32 5} +!27 = !{!"function_entry_count", i64 1} +!28 = !{!"branch_weights", i32 20, i32 1} +!29 = !{!"branch_weights", i32 556, i32 20} +!30 = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66, i64 5, i64 55, i64 6, i64 44, i64 7, i64 33, i64 8, i64 22} + +declare void @llvm.lifetime.start(i64, i8* nocapture) + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) + +declare void @llvm.lifetime.end(i64, i8* nocapture) diff --git a/test/Transforms/PGOProfile/multiple_hash_profile.ll b/test/Transforms/PGOProfile/multiple_hash_profile.ll index f4041830f8f8..6da94826a954 100644 --- a/test/Transforms/PGOProfile/multiple_hash_profile.ll +++ b/test/Transforms/PGOProfile/multiple_hash_profile.ll @@ -27,8 +27,8 @@ entry: %cmp.i = icmp sgt i32 %i, 2 %mul.i = select i1 %cmp.i, i32 1, i32 %i ; CHECK: %mul.i = select i1 %cmp.i, i32 1, i32 %i -; CHECK-SAME !prof ![[BW:[0-9]+]] -; CHECK ![[BW]] = !{!"branch_weights", i32 12, i32 
6} +; CHECK-SAME: !prof ![[BW:[0-9]+]] +; CHECK: ![[BW]] = !{!"branch_weights", i32 12, i32 6} %retval.0.i = mul nsw i32 %mul.i, %i ret i32 %retval.0.i } diff --git a/test/Transforms/PGOProfile/statics_counter_naming.ll b/test/Transforms/PGOProfile/statics_counter_naming.ll index c882406ffe54..c329ddba9300 100644 --- a/test/Transforms/PGOProfile/statics_counter_naming.ll +++ b/test/Transforms/PGOProfile/statics_counter_naming.ll @@ -1,9 +1,14 @@ -; RUN: opt %s -pgo-instr-gen -S | FileCheck %s --check-prefix=GEN -; RUN: opt %s -passes=pgo-instr-gen -S | FileCheck %s --check-prefix=GEN +; RUN: opt %s -pgo-instr-gen -static-func-full-module-prefix=false -S | FileCheck %s --check-prefix=NOPATH +; RUN: opt %s -passes=pgo-instr-gen -static-func-full-module-prefix=false -S | FileCheck %s --check-prefix=NOPATH +; RUN: opt %s --pgo-instr-gen -static-func-strip-dirname-prefix=1000 -S | FileCheck %s --check-prefix=NOPATH +; RUN: opt %s -passes=pgo-instr-gen -static-func-strip-dirname-prefix=1000 -S | FileCheck %s --check-prefix=NOPATH +; RUN: opt %s --pgo-instr-gen -static-func-strip-dirname-prefix=1 -S | FileCheck %s --check-prefix=HASPATH +; RUN: opt %s -passes=pgo-instr-gen -static-func-strip-dirname-prefix=1 -S | FileCheck %s --check-prefix=HASPATH target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; GEN: @__profn_statics_counter_naming.ll_func = private constant [30 x i8] c"statics_counter_naming.ll:func" +; NOPATH: @__profn_statics_counter_naming.ll_func = private constant [30 x i8] c"statics_counter_naming.ll:func" +; HASPATH-NOT: @__profn_statics_counter_naming.ll_func = private constant [30 x i8] c"statics_counter_naming.ll:func" define internal i32 @func() { entry: diff --git a/test/Transforms/PGOProfile/thinlto_samplepgo_icp.ll b/test/Transforms/PGOProfile/thinlto_samplepgo_icp.ll new file mode 100644 index 000000000000..dfb6816db5f2 --- /dev/null +++ b/test/Transforms/PGOProfile/thinlto_samplepgo_icp.ll @@ -0,0 +1,63 @@ +; Do setup work for all below tests: generate bitcode and combined index +; RUN: opt -module-summary %s -o %t.bc +; RUN: opt -module-summary %p/Inputs/thinlto_samplepgo_icp.ll -o %t2.bc +; RUN: llvm-lto -thinlto -o %t3 %t.bc %t2.bc + +; Checks if calls to static target functions are properly imported and promoted +; by ICP. Note that the GUID in the profile is from the original name.
+; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -o %t4.bc -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS +; IMPORTS: Import _ZL3foov.llvm.0 +; RUN: opt %t4.bc -icp-lto -pgo-icall-prom -S -icp-count-threshold=1 | FileCheck %s --check-prefix=ICALL-PROM + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@fptr = local_unnamed_addr global void ()* null, align 8 + +; Function Attrs: norecurse uwtable +define i32 @main() local_unnamed_addr #0 !prof !34 { +entry: + %0 = load void ()*, void ()** @fptr, align 8 +; ICALL-PROM: br i1 %{{[0-9]+}}, label %if.true.direct_targ, label %if.false.orig_indirect + tail call void %0(), !prof !40 + ret i32 0 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3,!4} +!llvm.ident = !{!31} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 297016)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2) +!1 = !DIFile(filename: "main.cc", directory: ".") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"ProfileSummary", !5} +!5 = !{!6, !7, !8, !9, !10, !11, !12, !13} +!6 = !{!"ProfileFormat", !"SampleProfile"} +!7 = !{!"TotalCount", i64 3003} +!8 = !{!"MaxCount", i64 3000} +!9 = !{!"MaxInternalCount", i64 0} +!10 = !{!"MaxFunctionCount", i64 0} +!11 = !{!"NumCounts", i64 3} +!12 = !{!"NumFunctions", i64 1} +!13 = !{!"DetailedSummary", !14} +!14 = !{!15, !16, !17, !18, !19, !20, !20, !21, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30} +!15 = !{i32 10000, i64 3000, i32 1} +!16 = !{i32 100000, i64 3000, i32 1} +!17 = !{i32 200000, i64 3000, i32 1} +!18 = !{i32 300000, i64 3000, i32 1} +!19 = !{i32 400000, i64 3000, i32 1} +!20 = !{i32 500000, i64 3000, i32 1} +!21 = !{i32 600000, i64 3000, i32 1} +!22 = !{i32 700000, i64 3000, i32 1} +!23 = !{i32 800000, i64 3000, i32 1} +!24 = !{i32 900000, i64 3000, i32 1} +!25 = !{i32 950000, i64 3000, i32 1} +!26 = !{i32 990000, i64 3000, i32 1} +!27 = !{i32 999000, i64 3000, i32 1} +!28 = !{i32 999900, i64 2, i32 2} +!29 = !{i32 999990, i64 2, i32 2} +!30 = !{i32 999999, i64 2, i32 2} +!31 = !{!"clang version 5.0.0 (trunk 297016)"} +!34 = !{!"function_entry_count", i64 1} +!40 = !{!"VP", i32 0, i64 3000, i64 -8789629626369651636, i64 3000} diff --git a/test/Transforms/RewriteStatepointsForGC/base-vector.ll b/test/Transforms/RewriteStatepointsForGC/base-vector.ll index 9026275cf682..c34462f45169 100644 --- a/test/Transforms/RewriteStatepointsForGC/base-vector.ll +++ b/test/Transforms/RewriteStatepointsForGC/base-vector.ll @@ -88,6 +88,7 @@ entry: } declare void @use(i64 addrspace(1)*) "gc-leaf-function" +declare void @use_vec(<4 x i64 addrspace(1)*>) "gc-leaf-function" define void @test5(i1 %cnd, i64 addrspace(1)* %obj) gc "statepoint-example" { ; CHECK-LABEL: @test5 @@ -245,3 +246,17 @@ next: ret i64 addrspace(1)* %bdv } declare void @do_safepoint() + +define void @test11(<4 x i64 addrspace(1)*> %vec1) gc "statepoint-example" { +; CHECK-LABEL: @test11( +; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf{{.*}}<4 x i64 addrspace(1)*> %vec1) +; CHECK: %vec1.relocated = call coldcc <4 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v4p1i8 +; CHECK: %vec1.relocated.casted = bitcast <4 x i8 addrspace(1)*> %vec1.relocated to <4 x i64 addrspace(1)*> +; CHECK: %vec2.remat = getelementptr i64, <4 x i64 addrspace(1)*> %vec1.relocated.casted, i32 1024 +; CHECK: call void @use_vec(<4 x i64 addrspace(1)*> %vec2.remat) +entry: + %vec2 = getelementptr i64, 
<4 x i64 addrspace(1)*> %vec1, i32 1024
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use_vec(<4 x i64 addrspace(1)*> %vec2)
+  ret void
+}
diff --git a/test/Transforms/SCCP/indirectbr.ll b/test/Transforms/SCCP/indirectbr.ll
new file mode 100644
index 000000000000..b977961ca49b
--- /dev/null
+++ b/test/Transforms/SCCP/indirectbr.ll
@@ -0,0 +1,76 @@
+; RUN: opt -S -sccp < %s | FileCheck %s
+
+declare void @BB0_f()
+declare void @BB1_f()
+
+; Make sure we can eliminate what is in BB0 as we know that the indirectbr is going to BB1.
+;
+; CHECK-LABEL: define void @indbrtest1(
+; CHECK-NOT: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest1() {
+entry:
+  indirectbr i8* blockaddress(@indbrtest1, %BB1), [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+; Make sure we can eliminate what is in BB0 as we know that the indirectbr is going to BB1
+; by looking through the casts. The casts should be folded away when they are visited
+; before the indirectbr instruction.
+;
+; CHECK-LABEL: define void @indbrtest2(
+; CHECK-NOT: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest2() {
+entry:
+  %a = ptrtoint i8* blockaddress(@indbrtest2, %BB1) to i64
+  %b = inttoptr i64 %a to i8*
+  %c = bitcast i8* %b to i8*
+  indirectbr i8* %b, [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+; Make sure we cannot eliminate BB0, as we do not know the target of the indirectbr.
+;
+; CHECK-LABEL: define void @indbrtest3(
+; CHECK: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest3(i8** %Q) {
+entry:
+  %t = load i8*, i8** %Q
+  indirectbr i8* %t, [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+; Make sure we pick the first successor when the indirectbr address is undef, so BB0 is not eliminated.
+;
+; CHECK-LABEL: define void @indbrtest4(
+; CHECK: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest4(i8** %Q) {
+entry:
+  indirectbr i8* undef, [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+
diff --git a/test/Transforms/SCCP/loadtest.ll b/test/Transforms/SCCP/loadtest.ll
index b88b44b76040..89c7371625ad 100644
--- a/test/Transforms/SCCP/loadtest.ll
+++ b/test/Transforms/SCCP/loadtest.ll
@@ -1,7 +1,7 @@
 ; This test makes sure that these instructions are properly constant propagated.
-; RUN: opt < %s -default-data-layout="e-p:32:32" -sccp -S | FileCheck %s
-; RUN: opt < %s -default-data-layout="E-p:32:32" -sccp -S | FileCheck %s
+; RUN: opt < %s -data-layout="e-p:32:32" -sccp -S | FileCheck %s
+; RUN: opt < %s -data-layout="E-p:32:32" -sccp -S | FileCheck %s
 ; CHECK-NOT: load
diff --git a/test/Transforms/SCCP/overdefined-div.ll b/test/Transforms/SCCP/overdefined-div.ll
new file mode 100644
index 000000000000..f0b16155c178
--- /dev/null
+++ b/test/Transforms/SCCP/overdefined-div.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -sccp -S | FileCheck %s
+
+; Test that SCCP knows when a udiv/sdiv with an overdefined operand still folds to a constant.
+
+; 0 / X = 0 even if X is overdefined.
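+; X / 0, by contrast, is not folded: udiv/sdiv by zero is undefined behavior in
+; LLVM IR, so SCCP leaves those divisions alone (test3 and test4 below).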
+; CHECK-LABEL: test1 +; CHECK-NEXT: ret i32 0 +define i32 @test1(i32 %foo) { + %tinkywinky = udiv i32 0, %foo + ret i32 %tinkywinky +} + +; CHECK-LABEL: test2 +; CHECK-NEXT: ret i32 0 +define i32 @test2(i32 %foo) { + %tinkywinky = sdiv i32 0, %foo + ret i32 %tinkywinky +} + +; CHECK-LABEL: test3 +; CHECK: ret i32 %tinkywinky +define i32 @test3(i32 %foo) { + %tinkywinky = udiv i32 %foo, 0 + ret i32 %tinkywinky +} + +; CHECK-LABEL: test4 +; CHECK: ret i32 %tinkywinky +define i32 @test4(i32 %foo) { + %tinkywinky = sdiv i32 %foo, 0 + ret i32 %tinkywinky +} diff --git a/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll index 2a9fc9e1c03e..b7fa5452f251 100644 --- a/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT ; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER ; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST @@ -8,7 +9,7 @@ target triple = "aarch64--linux-gnu" @a = common global [80 x i8] zeroinitializer, align 16 ; DEFAULT-LABEL: @PR28330( -; DEFAULT: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ] +; DEFAULT: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ] ; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80> ; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> ; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]] @@ -17,10 +18,10 @@ target triple = "aarch64--linux-gnu" ; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]] ; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0 -; DEFAULT: %tmp34 = add i32 %[[R6]], %tmp17 +; DEFAULT: %bin.extra = add i32 %[[R6]], %tmp17 ; ; GATHER-LABEL: @PR28330( -; GATHER: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ] +; GATHER: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ] ; GATHER: %tmp19 = select i1 %tmp1, i32 -720, i32 -80 ; GATHER: %tmp21 = select i1 %tmp3, i32 -720, i32 -80 ; GATHER: %tmp23 = select i1 %tmp5, i32 -720, i32 -80 @@ -44,7 +45,7 @@ target triple = "aarch64--linux-gnu" ; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]] ; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0 -; GATHER: %tmp34 = add i32 %[[R6]], %tmp17 +; GATHER: %bin.extra = add i32 %[[R6]], %tmp17 ; ; MAX-COST-LABEL: @PR28330( ; MAX-COST-NOT: shufflevector @@ -89,3 +90,126 @@ for.body: %tmp34 = add i32 %tmp32, %tmp33 br label %for.body } + +define void @PR32038(i32 %n) { +; DEFAULT-LABEL: @PR32038( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 +; DEFAULT-NEXT: 
[[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer +; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] +; DEFAULT: for.body: +; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80> +; DEFAULT-NEXT: [[TMP20:%.*]] = add i32 -5, undef +; DEFAULT-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef +; DEFAULT-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], undef +; DEFAULT-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], undef +; DEFAULT-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], undef +; DEFAULT-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], undef +; DEFAULT-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], undef +; DEFAULT-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> +; DEFAULT-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP2]], [[RDX_SHUF]] +; DEFAULT-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; DEFAULT-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; DEFAULT-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; DEFAULT-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], -5 +; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef +; DEFAULT-NEXT: br label [[FOR_BODY]] +; +; GATHER-LABEL: @PR32038( +; GATHER-NEXT: entry: +; GATHER-NEXT: [[TMP0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 +; GATHER-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; GATHER-NEXT: [[TMP2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 +; GATHER-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0 +; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 +; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 +; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 +; GATHER-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 +; GATHER-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 +; GATHER-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0 +; GATHER-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 +; GATHER-NEXT: [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0 +; GATHER-NEXT: [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 +; GATHER-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0 +; GATHER-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 +; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 +; GATHER-NEXT: br label [[FOR_BODY:%.*]] +; GATHER: for.body: +; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80 +; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, 
[[TMP19]] +; GATHER-NEXT: [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80 +; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]] +; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 +; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] +; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 +; GATHER-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]] +; GATHER-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 +; GATHER-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] +; GATHER-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 +; GATHER-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] +; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 +; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] +; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 +; GATHER-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0 +; GATHER-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP21]], i32 1 +; GATHER-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP23]], i32 2 +; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP25]], i32 3 +; GATHER-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP27]], i32 4 +; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP29]], i32 5 +; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP31]], i32 6 +; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP33]], i32 7 +; GATHER-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> +; GATHER-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP7]], [[RDX_SHUF]] +; GATHER-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; GATHER-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; GATHER-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; GATHER-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], -5 +; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] +; GATHER-NEXT: br label [[FOR_BODY]] +; +; MAX-COST-LABEL: @PR32038( +entry: + %tmp0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 + %tmp1 = icmp eq i8 %tmp0, 0 + %tmp2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 + %tmp3 = icmp eq i8 %tmp2, 0 + %tmp4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 + %tmp5 = icmp eq i8 %tmp4, 0 + %tmp6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 + %tmp7 = icmp eq i8 %tmp6, 0 + %tmp8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 + %tmp9 = icmp eq i8 %tmp8, 0 + %tmp10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 + %tmp11 = icmp eq i8 %tmp10, 0 + %tmp12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 + %tmp13 = icmp eq i8 %tmp12, 0 + %tmp14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 
8), align 8 + %tmp15 = icmp eq i8 %tmp14, 0 + br label %for.body + +for.body: + %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ] + %tmp19 = select i1 %tmp1, i32 -720, i32 -80 + %tmp20 = add i32 -5, %tmp19 + %tmp21 = select i1 %tmp3, i32 -720, i32 -80 + %tmp22 = add i32 %tmp20, %tmp21 + %tmp23 = select i1 %tmp5, i32 -720, i32 -80 + %tmp24 = add i32 %tmp22, %tmp23 + %tmp25 = select i1 %tmp7, i32 -720, i32 -80 + %tmp26 = add i32 %tmp24, %tmp25 + %tmp27 = select i1 %tmp9, i32 -720, i32 -80 + %tmp28 = add i32 %tmp26, %tmp27 + %tmp29 = select i1 %tmp11, i32 -720, i32 -80 + %tmp30 = add i32 %tmp28, %tmp29 + %tmp31 = select i1 %tmp13, i32 -720, i32 -80 + %tmp32 = add i32 %tmp30, %tmp31 + %tmp33 = select i1 %tmp15, i32 -720, i32 -80 + %tmp34 = add i32 %tmp32, %tmp33 + br label %for.body +} diff --git a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll index 35763953911b..63c6d77954d8 100644 --- a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll +++ b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll @@ -9,7 +9,7 @@ target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32- ; Simple 3-pair chain with loads and stores -define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) { +define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) { ; CHECK-LABEL: @test1_as_3_3_3( ; CHECK: load <2 x double>, <2 x double> addrspace(3)* ; CHECK: load <2 x double>, <2 x double> addrspace(3)* @@ -29,7 +29,7 @@ define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, do ret void } -define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) { +define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) { ; CHECK-LABEL: @test1_as_3_0_0( ; CHECK: load <2 x double>, <2 x double> addrspace(3)* ; CHECK: load <2 x double>, <2 x double>* @@ -49,7 +49,7 @@ define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) { ret void } -define void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) { +define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) { ; CHECK-LABEL: @test1_as_0_0_3( ; CHECK: load <2 x double>, <2 x double>* ; CHECK: load <2 x double>, <2 x double>* diff --git a/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll b/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll new file mode 100644 index 000000000000..1a32f6590663 --- /dev/null +++ b/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll @@ -0,0 +1,36 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -debug-only=SLP \ +; RUN: -S -disable-output < %s 2>&1 | FileCheck %s +; +; Check that SLP vectorizer gets the right cost difference for a compare +; node. + +; Function Attrs: norecurse nounwind readonly +define void @fun(i8* nocapture, i32 zeroext) local_unnamed_addr #0 { +.lr.ph.preheader: + br label %.lr.ph + +.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph + %2 = phi i32 [ %., %.lr.ph ], [ undef, %.lr.ph.preheader ] + %3 = phi i32 [ %.9, %.lr.ph ], [ undef, %.lr.ph.preheader ] + %4 = icmp ult i32 %2, %1 + %5 = select i1 %4, i32 0, i32 %1 + %. = sub i32 %2, %5 + %6 = icmp ult i32 %3, %1 + %7 = select i1 %6, i32 0, i32 %1 + %.9 = sub i32 %3, %7 + %8 = zext i32 %. 
to i64 + %9 = getelementptr inbounds i8, i8* %0, i64 %8 + %10 = load i8, i8* %9, align 1 + %11 = zext i32 %.9 to i64 + %12 = getelementptr inbounds i8, i8* %0, i64 %11 + %13 = load i8, i8* %12, align 1 + %14 = icmp eq i8 %10, %13 + br i1 %14, label %.lr.ph, label %._crit_edge + +._crit_edge: ; preds = %.lr.ph + ret void + +; CHECK: SLP: Adding cost -1 for bundle that starts with %4 = icmp ult i32 %2, %1. +} + diff --git a/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/test/Transforms/SLPVectorizer/X86/bitreverse.ll index c6d65bbe6840..749e93b04134 100644 --- a/test/Transforms/SLPVectorizer/X86/bitreverse.ll +++ b/test/Transforms/SLPVectorizer/X86/bitreverse.ll @@ -22,29 +22,11 @@ declare i16 @llvm.bitreverse.i16(i16) declare i8 @llvm.bitreverse.i8(i8) define void @bitreverse_2i64() #0 { -; SSE-LABEL: @bitreverse_2i64( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]]) -; SSE-NEXT: [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]]) -; SSE-NEXT: store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: ret void -; -; AVX-LABEL: @bitreverse_2i64( -; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 -; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 -; AVX-NEXT: [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]]) -; AVX-NEXT: [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]]) -; AVX-NEXT: store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX-NEXT: store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX-NEXT: ret void -; -; XOP-LABEL: @bitreverse_2i64( -; XOP-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 -; XOP-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) -; XOP-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 -; XOP-NEXT: ret void +; CHECK-LABEL: @bitreverse_2i64( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 @@ -57,40 +39,19 @@ define void @bitreverse_2i64() #0 { define void @bitreverse_4i64() #0 { ; SSE-LABEL: @bitreverse_4i64( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, 
i64 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; SSE-NEXT: [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]]) -; SSE-NEXT: [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]]) -; SSE-NEXT: [[BITREVERSE2:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD2]]) -; SSE-NEXT: [[BITREVERSE3:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD3]]) -; SSE-NEXT: store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; SSE-NEXT: store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; SSE-NEXT: store i64 [[BITREVERSE2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; SSE-NEXT: store i64 [[BITREVERSE3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP2]]) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @bitreverse_4i64( -; AVX1-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; AVX1-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; AVX1-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; AVX1-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; AVX1-NEXT: [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]]) -; AVX1-NEXT: [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]]) -; AVX1-NEXT: [[BITREVERSE2:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD2]]) -; AVX1-NEXT: [[BITREVERSE3:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD3]]) -; AVX1-NEXT: store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; AVX1-NEXT: store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; AVX1-NEXT: store i64 [[BITREVERSE2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; AVX1-NEXT: store i64 [[BITREVERSE3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @bitreverse_4i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]]) -; AVX2-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 -; AVX2-NEXT: ret void +; AVX-LABEL: @bitreverse_4i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> 
@llvm.bitreverse.v4i64(<4 x i64> [[TMP1]]) +; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4 +; AVX-NEXT: ret void ; ; XOP-LABEL: @bitreverse_4i64( ; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4 @@ -114,32 +75,11 @@ define void @bitreverse_4i64() #0 { } define void @bitreverse_4i32() #0 { -; SSE-LABEL: @bitreverse_4i32( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[BITREVERSE0:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD0]]) -; SSE-NEXT: [[BITREVERSE1:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD1]]) -; SSE-NEXT: [[BITREVERSE2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD2]]) -; SSE-NEXT: [[BITREVERSE3:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD3]]) -; SSE-NEXT: store i32 [[BITREVERSE0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[BITREVERSE1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[BITREVERSE2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[BITREVERSE3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: ret void -; -; AVX-LABEL: @bitreverse_4i32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]]) -; AVX-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 -; AVX-NEXT: ret void -; -; XOP-LABEL: @bitreverse_4i32( -; XOP-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 -; XOP-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]]) -; XOP-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 -; XOP-NEXT: ret void +; CHECK-LABEL: @bitreverse_4i32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]]) +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 +; CHECK-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 @@ -158,30 +98,12 @@ define void @bitreverse_4i32() #0 { define void @bitreverse_8i32() #0 { ; SSE-LABEL: @bitreverse_8i32( -; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* 
@src32, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; SSE-NEXT: [[BITREVERSE0:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD0]]) -; SSE-NEXT: [[BITREVERSE1:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD1]]) -; SSE-NEXT: [[BITREVERSE2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD2]]) -; SSE-NEXT: [[BITREVERSE3:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD3]]) -; SSE-NEXT: [[BITREVERSE4:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD4]]) -; SSE-NEXT: [[BITREVERSE5:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD5]]) -; SSE-NEXT: [[BITREVERSE6:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD6]]) -; SSE-NEXT: [[BITREVERSE7:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD7]]) -; SSE-NEXT: store i32 [[BITREVERSE0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE-NEXT: store i32 [[BITREVERSE1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE-NEXT: store i32 [[BITREVERSE2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE-NEXT: store i32 [[BITREVERSE3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE-NEXT: store i32 [[BITREVERSE4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: store i32 [[BITREVERSE5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE-NEXT: store i32 [[BITREVERSE6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE-NEXT: store i32 [[BITREVERSE7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @bitreverse_8i32( @@ -224,44 +146,11 @@ define void @bitreverse_8i32() #0 { } define void @bitreverse_8i16() #0 { -; SSE-LABEL: @bitreverse_8i16( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x 
i16]* @src16, i16 0, i64 4), align 2 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2 -; SSE-NEXT: [[BITREVERSE0:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD0]]) -; SSE-NEXT: [[BITREVERSE1:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD1]]) -; SSE-NEXT: [[BITREVERSE2:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD2]]) -; SSE-NEXT: [[BITREVERSE3:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD3]]) -; SSE-NEXT: [[BITREVERSE4:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD4]]) -; SSE-NEXT: [[BITREVERSE5:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD5]]) -; SSE-NEXT: [[BITREVERSE6:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD6]]) -; SSE-NEXT: [[BITREVERSE7:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD7]]) -; SSE-NEXT: store i16 [[BITREVERSE0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2 -; SSE-NEXT: store i16 [[BITREVERSE1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2 -; SSE-NEXT: store i16 [[BITREVERSE2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2 -; SSE-NEXT: store i16 [[BITREVERSE3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2 -; SSE-NEXT: store i16 [[BITREVERSE4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2 -; SSE-NEXT: store i16 [[BITREVERSE5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2 -; SSE-NEXT: store i16 [[BITREVERSE6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2 -; SSE-NEXT: store i16 [[BITREVERSE7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2 -; SSE-NEXT: ret void -; -; AVX-LABEL: @bitreverse_8i16( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]]) -; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 -; AVX-NEXT: ret void -; -; XOP-LABEL: @bitreverse_8i16( -; XOP-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; XOP-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]]) -; XOP-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 -; XOP-NEXT: ret void +; CHECK-LABEL: @bitreverse_8i16( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; CHECK-NEXT: ret void ; %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2 %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2 @@ -292,54 +181,12 @@ define void @bitreverse_8i16() #0 { define void @bitreverse_16i16() #0 { ; SSE-LABEL: @bitreverse_16i16( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2 
-; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2 -; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2 -; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2 -; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2 -; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2 -; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2 -; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2 -; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2 -; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2 -; SSE-NEXT: [[BITREVERSE0:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD0]]) -; SSE-NEXT: [[BITREVERSE1:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD1]]) -; SSE-NEXT: [[BITREVERSE2:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD2]]) -; SSE-NEXT: [[BITREVERSE3:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD3]]) -; SSE-NEXT: [[BITREVERSE4:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD4]]) -; SSE-NEXT: [[BITREVERSE5:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD5]]) -; SSE-NEXT: [[BITREVERSE6:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD6]]) -; SSE-NEXT: [[BITREVERSE7:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD7]]) -; SSE-NEXT: [[BITREVERSE8:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD8]]) -; SSE-NEXT: [[BITREVERSE9:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD9]]) -; SSE-NEXT: [[BITREVERSE10:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD10]]) -; SSE-NEXT: [[BITREVERSE11:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD11]]) -; SSE-NEXT: [[BITREVERSE12:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD12]]) -; SSE-NEXT: [[BITREVERSE13:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD13]]) -; SSE-NEXT: [[BITREVERSE14:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD14]]) -; SSE-NEXT: [[BITREVERSE15:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD15]]) -; SSE-NEXT: store i16 [[BITREVERSE0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2 -; SSE-NEXT: store i16 [[BITREVERSE1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2 -; SSE-NEXT: store i16 [[BITREVERSE2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2 -; SSE-NEXT: store i16 [[BITREVERSE3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* 
@dst16, i16 0, i64 3), align 2 -; SSE-NEXT: store i16 [[BITREVERSE4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2 -; SSE-NEXT: store i16 [[BITREVERSE5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2 -; SSE-NEXT: store i16 [[BITREVERSE6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2 -; SSE-NEXT: store i16 [[BITREVERSE7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2 -; SSE-NEXT: store i16 [[BITREVERSE8]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2 -; SSE-NEXT: store i16 [[BITREVERSE9]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2 -; SSE-NEXT: store i16 [[BITREVERSE10]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2 -; SSE-NEXT: store i16 [[BITREVERSE11]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2 -; SSE-NEXT: store i16 [[BITREVERSE12]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2 -; SSE-NEXT: store i16 [[BITREVERSE13]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2 -; SSE-NEXT: store i16 [[BITREVERSE14]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2 -; SSE-NEXT: store i16 [[BITREVERSE15]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]]) +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @bitreverse_16i16( @@ -406,68 +253,11 @@ define void @bitreverse_16i16() #0 { } define void @bitreverse_16i8() #0 { -; SSE-LABEL: @bitreverse_16i8( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1 -; SSE-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1 -; SSE-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1 -; SSE-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1 -; SSE-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1 -; SSE-NEXT: [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1 -; SSE-NEXT: [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 
9), align 1 -; SSE-NEXT: [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1 -; SSE-NEXT: [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1 -; SSE-NEXT: [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1 -; SSE-NEXT: [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1 -; SSE-NEXT: [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1 -; SSE-NEXT: [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1 -; SSE-NEXT: [[BITREVERSE0:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD0]]) -; SSE-NEXT: [[BITREVERSE1:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD1]]) -; SSE-NEXT: [[BITREVERSE2:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD2]]) -; SSE-NEXT: [[BITREVERSE3:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD3]]) -; SSE-NEXT: [[BITREVERSE4:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD4]]) -; SSE-NEXT: [[BITREVERSE5:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD5]]) -; SSE-NEXT: [[BITREVERSE6:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD6]]) -; SSE-NEXT: [[BITREVERSE7:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD7]]) -; SSE-NEXT: [[BITREVERSE8:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD8]]) -; SSE-NEXT: [[BITREVERSE9:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD9]]) -; SSE-NEXT: [[BITREVERSE10:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD10]]) -; SSE-NEXT: [[BITREVERSE11:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD11]]) -; SSE-NEXT: [[BITREVERSE12:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD12]]) -; SSE-NEXT: [[BITREVERSE13:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD13]]) -; SSE-NEXT: [[BITREVERSE14:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD14]]) -; SSE-NEXT: [[BITREVERSE15:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD15]]) -; SSE-NEXT: store i8 [[BITREVERSE0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1 -; SSE-NEXT: store i8 [[BITREVERSE1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1 -; SSE-NEXT: store i8 [[BITREVERSE2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1 -; SSE-NEXT: store i8 [[BITREVERSE3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1 -; SSE-NEXT: store i8 [[BITREVERSE4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1 -; SSE-NEXT: store i8 [[BITREVERSE5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1 -; SSE-NEXT: store i8 [[BITREVERSE6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1 -; SSE-NEXT: store i8 [[BITREVERSE7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1 -; SSE-NEXT: store i8 [[BITREVERSE8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1 -; SSE-NEXT: store i8 [[BITREVERSE9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1 -; SSE-NEXT: store i8 [[BITREVERSE10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1 -; SSE-NEXT: store i8 [[BITREVERSE11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1 -; SSE-NEXT: store i8 [[BITREVERSE12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1 -; SSE-NEXT: store i8 [[BITREVERSE13]], i8* getelementptr inbounds ([32 x i8], [32 x 
i8]* @dst8, i8 0, i64 13), align 1 -; SSE-NEXT: store i8 [[BITREVERSE14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1 -; SSE-NEXT: store i8 [[BITREVERSE15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1 -; SSE-NEXT: ret void -; -; AVX-LABEL: @bitreverse_16i8( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) -; AVX-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; AVX-NEXT: ret void -; -; XOP-LABEL: @bitreverse_16i8( -; XOP-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; XOP-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) -; XOP-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; XOP-NEXT: ret void +; CHECK-LABEL: @bitreverse_16i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; CHECK-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 @@ -521,122 +311,14 @@ define void @bitreverse_16i8() #0 { } define void @bitreverse_32i8() #0 { -; SSE-LABEL: @bitreverse_32i8( -; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 -; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 -; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1 -; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1 -; SSE-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1 -; SSE-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1 -; SSE-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1 -; SSE-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1 -; SSE-NEXT: [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1 -; SSE-NEXT: [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1 -; SSE-NEXT: [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1 -; SSE-NEXT: [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1 -; SSE-NEXT: [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1 -; SSE-NEXT: [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1 -; SSE-NEXT: [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1 -; SSE-NEXT: [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1 -; SSE-NEXT: [[LD16:%.*]] = load 
i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1 -; SSE-NEXT: [[LD17:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1 -; SSE-NEXT: [[LD18:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1 -; SSE-NEXT: [[LD19:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1 -; SSE-NEXT: [[LD20:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1 -; SSE-NEXT: [[LD21:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1 -; SSE-NEXT: [[LD22:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1 -; SSE-NEXT: [[LD23:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1 -; SSE-NEXT: [[LD24:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1 -; SSE-NEXT: [[LD25:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1 -; SSE-NEXT: [[LD26:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1 -; SSE-NEXT: [[LD27:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1 -; SSE-NEXT: [[LD28:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1 -; SSE-NEXT: [[LD29:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1 -; SSE-NEXT: [[LD30:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1 -; SSE-NEXT: [[LD31:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1 -; SSE-NEXT: [[BITREVERSE0:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD0]]) -; SSE-NEXT: [[BITREVERSE1:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD1]]) -; SSE-NEXT: [[BITREVERSE2:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD2]]) -; SSE-NEXT: [[BITREVERSE3:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD3]]) -; SSE-NEXT: [[BITREVERSE4:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD4]]) -; SSE-NEXT: [[BITREVERSE5:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD5]]) -; SSE-NEXT: [[BITREVERSE6:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD6]]) -; SSE-NEXT: [[BITREVERSE7:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD7]]) -; SSE-NEXT: [[BITREVERSE8:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD8]]) -; SSE-NEXT: [[BITREVERSE9:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD9]]) -; SSE-NEXT: [[BITREVERSE10:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD10]]) -; SSE-NEXT: [[BITREVERSE11:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD11]]) -; SSE-NEXT: [[BITREVERSE12:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD12]]) -; SSE-NEXT: [[BITREVERSE13:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD13]]) -; SSE-NEXT: [[BITREVERSE14:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD14]]) -; SSE-NEXT: [[BITREVERSE15:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD15]]) -; SSE-NEXT: [[BITREVERSE16:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD16]]) -; SSE-NEXT: [[BITREVERSE17:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD17]]) -; SSE-NEXT: [[BITREVERSE18:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD18]]) -; SSE-NEXT: [[BITREVERSE19:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD19]]) -; SSE-NEXT: [[BITREVERSE20:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD20]]) -; SSE-NEXT: [[BITREVERSE21:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD21]]) -; SSE-NEXT: [[BITREVERSE22:%.*]] = call 
i8 @llvm.bitreverse.i8(i8 [[LD22]]) -; SSE-NEXT: [[BITREVERSE23:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD23]]) -; SSE-NEXT: [[BITREVERSE24:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD24]]) -; SSE-NEXT: [[BITREVERSE25:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD25]]) -; SSE-NEXT: [[BITREVERSE26:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD26]]) -; SSE-NEXT: [[BITREVERSE27:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD27]]) -; SSE-NEXT: [[BITREVERSE28:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD28]]) -; SSE-NEXT: [[BITREVERSE29:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD29]]) -; SSE-NEXT: [[BITREVERSE30:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD30]]) -; SSE-NEXT: [[BITREVERSE31:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD31]]) -; SSE-NEXT: store i8 [[BITREVERSE0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1 -; SSE-NEXT: store i8 [[BITREVERSE1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1 -; SSE-NEXT: store i8 [[BITREVERSE2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1 -; SSE-NEXT: store i8 [[BITREVERSE3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1 -; SSE-NEXT: store i8 [[BITREVERSE4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1 -; SSE-NEXT: store i8 [[BITREVERSE5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1 -; SSE-NEXT: store i8 [[BITREVERSE6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1 -; SSE-NEXT: store i8 [[BITREVERSE7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1 -; SSE-NEXT: store i8 [[BITREVERSE8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1 -; SSE-NEXT: store i8 [[BITREVERSE9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1 -; SSE-NEXT: store i8 [[BITREVERSE10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1 -; SSE-NEXT: store i8 [[BITREVERSE11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1 -; SSE-NEXT: store i8 [[BITREVERSE12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1 -; SSE-NEXT: store i8 [[BITREVERSE13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1 -; SSE-NEXT: store i8 [[BITREVERSE14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1 -; SSE-NEXT: store i8 [[BITREVERSE15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1 -; SSE-NEXT: store i8 [[BITREVERSE16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1 -; SSE-NEXT: store i8 [[BITREVERSE17]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1 -; SSE-NEXT: store i8 [[BITREVERSE18]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1 -; SSE-NEXT: store i8 [[BITREVERSE19]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1 -; SSE-NEXT: store i8 [[BITREVERSE20]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1 -; SSE-NEXT: store i8 [[BITREVERSE21]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1 -; SSE-NEXT: store i8 [[BITREVERSE22]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1 -; SSE-NEXT: store i8 [[BITREVERSE23]], i8* getelementptr inbounds ([32 x i8], [32 x 
i8]* @dst8, i8 0, i64 23), align 1 -; SSE-NEXT: store i8 [[BITREVERSE24]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1 -; SSE-NEXT: store i8 [[BITREVERSE25]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1 -; SSE-NEXT: store i8 [[BITREVERSE26]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1 -; SSE-NEXT: store i8 [[BITREVERSE27]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1 -; SSE-NEXT: store i8 [[BITREVERSE28]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1 -; SSE-NEXT: store i8 [[BITREVERSE29]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1 -; SSE-NEXT: store i8 [[BITREVERSE30]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1 -; SSE-NEXT: store i8 [[BITREVERSE31]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1 -; SSE-NEXT: ret void -; -; AVX-LABEL: @bitreverse_32i8( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) -; AVX-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]]) -; AVX-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; AVX-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 -; AVX-NEXT: ret void -; -; XOP-LABEL: @bitreverse_32i8( -; XOP-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; XOP-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; XOP-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) -; XOP-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]]) -; XOP-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 -; XOP-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 -; XOP-NEXT: ret void +; CHECK-LABEL: @bitreverse_32i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]]) +; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: ret void ; %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1 %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1 diff --git a/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll 
b/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll new file mode 100644 index 000000000000..ba0059ed4e51 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86-64-unknown-linux -mcpu=bdver2 -instcombine | FileCheck %s + +define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @g( +; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 +; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1 +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] +; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i8> undef, i8 [[X0X0]], i32 0 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[Y1Y1]], i32 1 +; CHECK-NEXT: ret <2 x i8> [[INS2]] +; + %x0 = extractelement <2 x i8> %x, i32 0 + %y1 = extractelement <2 x i8> %y, i32 1 + %x0x0 = mul i8 %x0, %x0 + %y1y1 = mul i8 %y1, %y1 + %ins1 = insertelement <2 x i8> undef, i8 %x0x0, i32 0 + %ins2 = insertelement <2 x i8> %ins1, i8 %y1y1, i32 1 + ret <2 x i8> %ins2 +} + +define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: @h( +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] +; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] +; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x i8> undef, i8 [[X0X0]], i32 0 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1 +; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2 +; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3 +; CHECK-NEXT: ret <4 x i8> [[INS4]] +; + %x0 = extractelement <4 x i8> %x, i32 0 + %x3 = extractelement <4 x i8> %x, i32 3 + %y1 = extractelement <4 x i8> %y, i32 1 + %y2 = extractelement <4 x i8> %y, i32 2 + %x0x0 = mul i8 %x0, %x0 + %x3x3 = mul i8 %x3, %x3 + %y1y1 = mul i8 %y1, %y1 + %y2y2 = mul i8 %y2, %y2 + %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0 + %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 + %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 + %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 + ret <4 x i8> %ins4 +} + +define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: @h_undef( +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3 +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] +; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> undef, i8 [[X3X3]], i32 1 +; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2 +; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3 +; CHECK-NEXT: ret <4 x i8> [[INS4]] +; + %x0 = extractelement <4 x i8> undef, i32 0 + %x3 = extractelement <4 x i8> %x, i32 3 + %y1 = extractelement <4 x i8> %y, i32 1 + %y2 = extractelement <4 x i8> %y, i32 2 + %x0x0 = mul i8 %x0, %x0 + %x3x3 = mul i8 %x3, %x3 + 
%y1y1 = mul i8 %y1, %y1 + %y2y2 = mul i8 %y2, %y2 + %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0 + %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 + %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 + %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 + ret <4 x i8> %ins4 +} + +define i8 @i(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: @i( +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] +; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]] +; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i8 [[TMP3]] +; + %x0 = extractelement <4 x i8> %x, i32 0 + %x3 = extractelement <4 x i8> %x, i32 3 + %y1 = extractelement <4 x i8> %y, i32 1 + %y2 = extractelement <4 x i8> %y, i32 2 + %x0x0 = mul i8 %x0, %x0 + %x3x3 = mul i8 %x3, %x3 + %y1y1 = mul i8 %y1, %y1 + %y2y2 = mul i8 %y2, %y2 + %1 = add i8 %x0x0, %x3x3 + %2 = add i8 %y1y1, %y2y2 + %3 = add i8 %1, %2 + ret i8 %3 +} + +define i8 @j(<4 x i8> %x, <4 x i8> %y) { +; CHECK-LABEL: @j( +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] +; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]] +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i8 [[TMP3]] +; + %x0 = extractelement <4 x i8> %x, i32 0 + %x3 = extractelement <4 x i8> %x, i32 3 + %y1 = extractelement <4 x i8> %y, i32 1 + %y2 = extractelement <4 x i8> %y, i32 2 + %x0x0 = mul i8 %x0, %x0 + %x3x3 = mul i8 %x3, %x3 + %y1y1 = mul i8 %y1, %y1 + %y2y2 = mul i8 %y2, %y2 + %1 = add i8 %x0x0, %x3x3 + %2 = add i8 %y1y1, %y2y2 + %3 = sdiv i8 %1, %2 + ret i8 %3 +} + +define i8 @k(<4 x i8> %x) { +; CHECK-LABEL: @k( +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1 +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2 +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] +; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] +; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i8 [[TMP3]] +; + %x0 = extractelement <4 x i8> %x, i32 0 + %x3 = extractelement <4 x i8> %x, i32 3 + %x1 = extractelement <4 x i8> %x, i32 1 + %x2 = extractelement <4 x i8> %x, i32 2 + %x0x0 = mul i8 %x0, %x0 + %x3x3 = mul i8 %x3, %x3 + %x1x1 = mul i8 %x1, %x1 + %x2x2 = mul i8 %x2, %x2 + %1 = add i8 %x0x0, %x3x3 + %2 = add i8 %x1x1, 
%x2x2 + %3 = sdiv i8 %1, %2 + ret i8 %3 +} diff --git a/test/Transforms/SLPVectorizer/X86/extractelement.ll b/test/Transforms/SLPVectorizer/X86/extractelement.ll new file mode 100644 index 000000000000..10675f3be8a6 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/extractelement.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 | FileCheck %s +; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 -slp-threshold=-1 -slp-vectorize-hor-store | FileCheck %s --check-prefix=THRESH1 +; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 -slp-threshold=-2 -slp-vectorize-hor-store | FileCheck %s --check-prefix=THRESH2 + +@a = global float 0.000000e+00, align 4 + +define float @f(<2 x float> %x) { +; CHECK-LABEL: @f( +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[X:%.*]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret float [[ADD]] +; + %x0 = extractelement <2 x float> %x, i32 0 + %x1 = extractelement <2 x float> %x, i32 1 + %x0x0 = fmul float %x0, %x0 + %x1x1 = fmul float %x1, %x1 + %add = fadd float %x0x0, %x1x1 + ret float %add +} + +define float @f_used_out_of_tree(<2 x float> %x) { +; THRESH2-LABEL: @f_used_out_of_tree( +; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 +; THRESH2-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]] +; THRESH2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] +; THRESH2-NEXT: store float [[ADD]], float* @a +; THRESH2-NEXT: ret float [[TMP1]] +; + %x0 = extractelement <2 x float> %x, i32 0 + %x1 = extractelement <2 x float> %x, i32 1 + %x0x0 = fmul float %x0, %x0 + %x1x1 = fmul float %x1, %x1 + %add = fadd float %x0x0, %x1x1 + store float %add, float* @a + ret float %x0 +} + +define float @f_used_twice_in_tree(<2 x float> %x) { +; THRESH1-LABEL: @f_used_twice_in_tree( +; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 +; THRESH1-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; THRESH1-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; THRESH1-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[X]], [[TMP3]] +; THRESH1-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; THRESH1-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH1-NEXT: ret float [[ADD]] +; + %x0 = extractelement <2 x float> %x, i32 0 + %x1 = extractelement <2 x float> %x, i32 1 + %x0x0 = fmul float %x0, %x1 + %x1x1 = fmul float %x1, %x1 + %add = fadd float %x0x0, %x1x1 + ret float %add +} + diff --git a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 3f9fffb9b624..73844037f12e 100644 --- a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s 
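
The horizontal-list.ll hunk that follows adds a second RUN line with -slp-threshold=-10: the SLP vectorizer's -slp-threshold flag sets the cost-model profitability cutoff, so a negative value makes the pass vectorize trees it would otherwise reject, and those runs are checked under the separate THRESHOLD prefix. The [[RDX_SHUF]]/[[BIN_RDX]] names in the updated checks encode the usual log2 shuffle ladder for a horizontal reduction; a minimal standalone sketch of that ladder for a <4 x float> fast-math sum (the function name @sum4 is hypothetical, but the shuffle masks mirror the ones in the checks below):

define float @sum4(<4 x float> %v) {
  ; fold the high half onto the low half: lanes <2,3> onto lanes <0,1>
  %rdx.shuf = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd fast <4 x float> %v, %rdx.shuf
  ; fold lane 1 onto lane 0
  %rdx.shuf1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fadd fast <4 x float> %bin.rdx, %rdx.shuf1
  ; lane 0 now holds the sum of all four input lanes
  %sum = extractelement <4 x float> %bin.rdx2, i32 0
  ret float %sum
}

Since the assertions in these files carry the autogenerated NOTE, check blocks like the ones below are refreshed by rerunning utils/update_test_checks.py (its --opt-binary option points at the opt under test) rather than edited by hand.
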
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -slp-threshold=-10 | FileCheck %s --check-prefix=THRESHOLD @n = external local_unnamed_addr global i32, align 4 @arr = common local_unnamed_addr global [20 x float] zeroinitializer, align 16 @@ -12,29 +13,55 @@ define float @baz() { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 -; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 -; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP8]], [[ADD_1]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP9]], [[ADD_2]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]] +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] -; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL4]], [[ADD7]] -; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD19]] -; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float 
[[TMP8]], [[ADD19_1]] -; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP9]], [[ADD19_2]] +; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] +; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] +; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] ; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4 ; CHECK-NEXT: ret float [[ADD19_3]] ; +; THRESHOLD-LABEL: @baz( +; THRESHOLD-NEXT: entry: +; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 +; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 +; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] +; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] +; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] +; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] +; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] +; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] +; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] +; THRESHOLD-NEXT: store float [[ADD19_3]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[ADD19_3]] +; entry: %0 = load i32, i32* @n, align 4 %mul = mul nsw i32 %0, 3 @@ -70,42 +97,62 @@ define float @bazz() { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 -; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 -; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float 
[[MUL4_1]], [[ADD]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 -; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 -; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4), align 16 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4), align 16 -; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 5), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 5), align 4 -; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]] -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8 -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 -; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP16]], [[ADD19_1]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 -; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP17]], [[ADD19_2]] -; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4 -; CHECK-NEXT: ret float [[ADD19_3]] +; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float undef, [[ADD7]] +; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]] +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: 
[[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]] +; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] +; CHECK-NEXT: store float [[BIN_EXTRA5]], float* @res, align 4 +; CHECK-NEXT: ret float [[BIN_EXTRA5]] +; +; THRESHOLD-LABEL: @bazz( +; THRESHOLD-NEXT: entry: +; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 +; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 +; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]] +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] +; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] +; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 +; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float +; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] +; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float undef, [[ADD7]] +; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]] +; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]] +; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] +; THRESHOLD-NEXT: store float [[BIN_EXTRA5]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]] ; entry: %0 = load i32, i32* @n, align 4 @@ -155,24 +202,39 @@ define float @bazzz() { ; CHECK-NEXT: 
entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 -; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]] -; CHECK-NEXT: store float [[TMP12]], float* @res, align 4 -; CHECK-NEXT: ret float [[TMP12]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]] +; CHECK-NEXT: store float [[TMP8]], float* @res, align 4 +; CHECK-NEXT: ret float [[TMP8]] +; +; THRESHOLD-LABEL: @bazzz( +; THRESHOLD-NEXT: entry: +; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 +; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef +; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd 
fast float undef, [[TMP4]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; THRESHOLD-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]] +; THRESHOLD-NEXT: store float [[TMP8]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[TMP8]] ; entry: %0 = load i32, i32* @n, align 4 @@ -202,26 +264,42 @@ define i32 @foo() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 -; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]] -; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP12]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP6:%.*]] = 
extractelement <4 x float> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]] +; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32 ; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4 ; CHECK-NEXT: ret i32 [[CONV4]] ; +; THRESHOLD-LABEL: @foo( +; THRESHOLD-NEXT: entry: +; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 +; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef +; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 +; THRESHOLD-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]] +; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32 +; THRESHOLD-NEXT: store i32 [[CONV4]], i32* @n, align 4 +; THRESHOLD-NEXT: ret i32 [[CONV4]] +; entry: %0 = load i32, i32* @n, align 4 %conv = sitofp i32 %0 to float @@ -269,6 +347,28 @@ define float @bar() { ; CHECK-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4 ; CHECK-NEXT: ret float [[MAX_0_MUL3_2]] ; +; THRESHOLD-LABEL: @bar( +; THRESHOLD-NEXT: entry: +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESHOLD-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]] +; THRESHOLD-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; THRESHOLD-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]] +; THRESHOLD-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; 
THRESHOLD-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]] +; THRESHOLD-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]] +; THRESHOLD-NEXT: store float [[MAX_0_MUL3_2]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[MAX_0_MUL3_2]] +; entry: %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 @@ -292,3 +392,1344 @@ entry: ret float %max.0.mul3.2 } +define float @f(float* nocapture readonly %x) { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 +; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 +; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 +; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 +; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 +; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 +; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10 +; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11 +; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 +; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 +; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 +; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, undef +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] +; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] +; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] +; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] +; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] +; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] +; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]] +; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]] +; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]] +; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]] +; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]] +; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] +; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] +; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 +; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 +; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 +; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19 +; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20 +; CHECK-NEXT: 
[[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21 +; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22 +; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23 +; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24 +; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25 +; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26 +; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27 +; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 +; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 +; CHECK-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 +; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 +; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32 +; CHECK-NEXT: [[ARRAYIDX_33:%.*]] = getelementptr inbounds float, float* [[X]], i64 33 +; CHECK-NEXT: [[ARRAYIDX_34:%.*]] = getelementptr inbounds float, float* [[X]], i64 34 +; CHECK-NEXT: [[ARRAYIDX_35:%.*]] = getelementptr inbounds float, float* [[X]], i64 35 +; CHECK-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds float, float* [[X]], i64 36 +; CHECK-NEXT: [[ARRAYIDX_37:%.*]] = getelementptr inbounds float, float* [[X]], i64 37 +; CHECK-NEXT: [[ARRAYIDX_38:%.*]] = getelementptr inbounds float, float* [[X]], i64 38 +; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds float, float* [[X]], i64 39 +; CHECK-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds float, float* [[X]], i64 40 +; CHECK-NEXT: [[ARRAYIDX_41:%.*]] = getelementptr inbounds float, float* [[X]], i64 41 +; CHECK-NEXT: [[ARRAYIDX_42:%.*]] = getelementptr inbounds float, float* [[X]], i64 42 +; CHECK-NEXT: [[ARRAYIDX_43:%.*]] = getelementptr inbounds float, float* [[X]], i64 43 +; CHECK-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds float, float* [[X]], i64 44 +; CHECK-NEXT: [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45 +; CHECK-NEXT: [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46 +; CHECK-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] +; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]] +; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]] +; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]] +; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]] +; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]] +; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]] +; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]] +; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]] +; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]] +; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] +; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] +; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] +; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] +; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]] +; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast 
float undef, [[ADD_30]] +; CHECK-NEXT: [[ADD_32:%.*]] = fadd fast float undef, [[ADD_31]] +; CHECK-NEXT: [[ADD_33:%.*]] = fadd fast float undef, [[ADD_32]] +; CHECK-NEXT: [[ADD_34:%.*]] = fadd fast float undef, [[ADD_33]] +; CHECK-NEXT: [[ADD_35:%.*]] = fadd fast float undef, [[ADD_34]] +; CHECK-NEXT: [[ADD_36:%.*]] = fadd fast float undef, [[ADD_35]] +; CHECK-NEXT: [[ADD_37:%.*]] = fadd fast float undef, [[ADD_36]] +; CHECK-NEXT: [[ADD_38:%.*]] = fadd fast float undef, [[ADD_37]] +; CHECK-NEXT: [[ADD_39:%.*]] = fadd fast float undef, [[ADD_38]] +; CHECK-NEXT: [[ADD_40:%.*]] = fadd fast float undef, [[ADD_39]] +; CHECK-NEXT: [[ADD_41:%.*]] = fadd fast float undef, [[ADD_40]] +; CHECK-NEXT: [[ADD_42:%.*]] = fadd fast float undef, [[ADD_41]] +; CHECK-NEXT: [[ADD_43:%.*]] = fadd fast float undef, [[ADD_42]] +; CHECK-NEXT: [[ADD_44:%.*]] = fadd fast float undef, [[ADD_43]] +; CHECK-NEXT: [[ADD_45:%.*]] = fadd fast float undef, [[ADD_44]] +; CHECK-NEXT: [[ADD_46:%.*]] = fadd fast float undef, [[ADD_45]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: 
[[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 +; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX10:%.*]] = fadd fast <16 x float> [[TMP1]], [[RDX_SHUF9]] +; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <16 x float> [[BIN_RDX10]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <16 x float> [[BIN_RDX10]], [[RDX_SHUF11]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <16 x float> [[BIN_RDX12]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <16 x float> [[BIN_RDX12]], [[RDX_SHUF13]] +; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0 +; CHECK-NEXT: [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]] +; CHECK-NEXT: ret float [[BIN_RDX17]] +; +; THRESHOLD-LABEL: @f( +; THRESHOLD-NEXT: entry: +; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 +; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 +; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 +; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 +; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 +; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 +; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 +; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 +; THRESHOLD-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 +; THRESHOLD-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10 +; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11 +; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 +; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 +; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 +; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>* +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4 +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, undef +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] +; 
THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] +; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] +; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] +; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] +; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] +; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] +; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]] +; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]] +; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]] +; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]] +; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]] +; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] +; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] +; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 +; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 +; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 +; THRESHOLD-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19 +; THRESHOLD-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20 +; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21 +; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22 +; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23 +; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24 +; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25 +; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26 +; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27 +; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 +; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 +; THRESHOLD-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 +; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 +; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32 +; THRESHOLD-NEXT: [[ARRAYIDX_33:%.*]] = getelementptr inbounds float, float* [[X]], i64 33 +; THRESHOLD-NEXT: [[ARRAYIDX_34:%.*]] = getelementptr inbounds float, float* [[X]], i64 34 +; THRESHOLD-NEXT: [[ARRAYIDX_35:%.*]] = getelementptr inbounds float, float* [[X]], i64 35 +; THRESHOLD-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds float, float* [[X]], i64 36 +; THRESHOLD-NEXT: [[ARRAYIDX_37:%.*]] = getelementptr inbounds float, float* [[X]], i64 37 +; THRESHOLD-NEXT: [[ARRAYIDX_38:%.*]] = getelementptr inbounds float, float* [[X]], i64 38 +; THRESHOLD-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds float, float* [[X]], i64 39 +; THRESHOLD-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds float, float* [[X]], i64 40 +; THRESHOLD-NEXT: [[ARRAYIDX_41:%.*]] = getelementptr inbounds float, float* [[X]], i64 41 +; THRESHOLD-NEXT: [[ARRAYIDX_42:%.*]] = getelementptr inbounds float, float* [[X]], i64 42 +; THRESHOLD-NEXT: [[ARRAYIDX_43:%.*]] = getelementptr inbounds float, float* [[X]], i64 43 +; THRESHOLD-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds float, float* [[X]], 
i64 44 +; THRESHOLD-NEXT: [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45 +; THRESHOLD-NEXT: [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46 +; THRESHOLD-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 +; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* +; THRESHOLD-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 +; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] +; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]] +; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]] +; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]] +; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]] +; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]] +; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]] +; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]] +; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]] +; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]] +; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] +; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] +; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] +; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] +; THRESHOLD-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]] +; THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]] +; THRESHOLD-NEXT: [[ADD_32:%.*]] = fadd fast float undef, [[ADD_31]] +; THRESHOLD-NEXT: [[ADD_33:%.*]] = fadd fast float undef, [[ADD_32]] +; THRESHOLD-NEXT: [[ADD_34:%.*]] = fadd fast float undef, [[ADD_33]] +; THRESHOLD-NEXT: [[ADD_35:%.*]] = fadd fast float undef, [[ADD_34]] +; THRESHOLD-NEXT: [[ADD_36:%.*]] = fadd fast float undef, [[ADD_35]] +; THRESHOLD-NEXT: [[ADD_37:%.*]] = fadd fast float undef, [[ADD_36]] +; THRESHOLD-NEXT: [[ADD_38:%.*]] = fadd fast float undef, [[ADD_37]] +; THRESHOLD-NEXT: [[ADD_39:%.*]] = fadd fast float undef, [[ADD_38]] +; THRESHOLD-NEXT: [[ADD_40:%.*]] = fadd fast float undef, [[ADD_39]] +; THRESHOLD-NEXT: [[ADD_41:%.*]] = fadd fast float undef, [[ADD_40]] +; THRESHOLD-NEXT: [[ADD_42:%.*]] = fadd fast float undef, [[ADD_41]] +; THRESHOLD-NEXT: [[ADD_43:%.*]] = fadd fast float undef, [[ADD_42]] +; THRESHOLD-NEXT: [[ADD_44:%.*]] = fadd fast float undef, [[ADD_43]] +; THRESHOLD-NEXT: [[ADD_45:%.*]] = fadd fast float undef, [[ADD_44]] +; THRESHOLD-NEXT: [[ADD_46:%.*]] = fadd fast float undef, [[ADD_45]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; 
THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; THRESHOLD-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX10:%.*]] = fadd fast <16 x float> [[TMP1]], [[RDX_SHUF9]]
+; THRESHOLD-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <16 x float> [[BIN_RDX10]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX12:%.*]] = fadd fast <16 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; THRESHOLD-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <16 x float> [[BIN_RDX12]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX14:%.*]] = fadd fast <16 x float> [[BIN_RDX12]], [[RDX_SHUF13]]
+; THRESHOLD-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
+; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0
+; THRESHOLD-NEXT: [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; THRESHOLD-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]]
+; THRESHOLD-NEXT: ret float [[BIN_RDX17]]
+;
+ entry:
+ %0 = load float, float* %x, align 4
+ %arrayidx.1 = getelementptr inbounds float, float* %x, i64 1
+ %1 = load float, float* %arrayidx.1, align 4
+ %add.1 = fadd fast float %1, %0
+ %arrayidx.2 = getelementptr inbounds float, float* %x, i64 2
+ %2 = load float, float* %arrayidx.2, align 4
+ %add.2 = fadd fast float %2, %add.1
+ %arrayidx.3 = getelementptr inbounds float, float* %x, i64 3
+ %3 = load float, float* %arrayidx.3, align 4
+ %add.3 = fadd fast float %3, %add.2
+ %arrayidx.4 = getelementptr inbounds float, float* %x, i64 4
+ %4 = load float, float* %arrayidx.4, align 4
+ %add.4 = fadd fast float %4, %add.3
+ %arrayidx.5 = getelementptr inbounds float, float* %x, i64 5
+ %5 = load float, float* %arrayidx.5, align 4
+ %add.5 = fadd fast float %5, %add.4
+ %arrayidx.6 = getelementptr inbounds float, float* %x, i64 6
+ %6 = load float, float* %arrayidx.6, align 4
+ %add.6 = fadd fast float %6, %add.5
+ %arrayidx.7 = getelementptr inbounds float, float* %x, i64 7
+ %7 = load float, float* %arrayidx.7, align 4
+ %add.7 = fadd fast float %7, %add.6
+ %arrayidx.8 = getelementptr inbounds float, float* %x, i64 8
+ %8 = load float, float* %arrayidx.8, align 4
+ %add.8 = fadd fast float %8, %add.7
+ %arrayidx.9 = getelementptr inbounds float, float* %x, i64 9
+ %9 = load float, float* %arrayidx.9, align 4
+ %add.9 = fadd fast float %9, %add.8
+ %arrayidx.10 = getelementptr inbounds float, float* %x, i64 10
+ %10 = load float, float* %arrayidx.10, align 4
+ %add.10 = fadd fast float %10, %add.9
+ %arrayidx.11 = getelementptr inbounds float, float* %x, i64 11
+ %11 = load float, float* %arrayidx.11, align 4
+ %add.11 = fadd fast float %11, %add.10
+ %arrayidx.12 = getelementptr inbounds float, float* %x, i64 12
+ %12 = load float, float* %arrayidx.12, align 4
+ %add.12 = fadd fast float %12, %add.11
+ %arrayidx.13 = getelementptr inbounds float, float* %x, i64 13
+ %13 = load float, float* %arrayidx.13, align 4
+ %add.13 = fadd fast float %13, %add.12
+ %arrayidx.14 = getelementptr inbounds float, float* %x, i64 14
+ %14 = load float, float* %arrayidx.14, align 4
+ %add.14 = fadd fast float %14, %add.13
+ %arrayidx.15 = getelementptr inbounds float, float* %x, i64 15
+ %15 = load float, float* %arrayidx.15, align 4
+ %add.15 = fadd fast float %15, %add.14
+ %arrayidx.16 = getelementptr inbounds float, float* %x, i64 16
+ %16 = load float, float* %arrayidx.16, align 4
+ %add.16 = fadd fast float %16, %add.15
+ %arrayidx.17 = getelementptr inbounds float, float* %x, i64 17
+ %17 = load float, float* %arrayidx.17, align 4
+ %add.17 = fadd fast float %17, %add.16
+ %arrayidx.18 = getelementptr inbounds float, float* %x, i64 18
+ %18 = load float, float* %arrayidx.18, align 4
+ %add.18 = fadd fast float %18, %add.17
+ %arrayidx.19 = getelementptr inbounds float, float* %x, i64 19
+ %19 = load float, float* %arrayidx.19, align 4
+ %add.19 = fadd fast float %19, %add.18
+ %arrayidx.20 = getelementptr inbounds float, float* %x, i64 20
+ %20 = load float, float* %arrayidx.20, align 4
+ %add.20 = fadd fast float %20, %add.19
+ %arrayidx.21 = getelementptr inbounds float, float* %x, i64 21
+ %21 = load float, float* %arrayidx.21, align 4
+ %add.21 = fadd fast float %21, %add.20
+ %arrayidx.22 = getelementptr inbounds float, float* %x, i64 22
+ %22 = load float, float* %arrayidx.22, align 4
+ %add.22 = fadd fast float %22, %add.21
+ %arrayidx.23 = getelementptr inbounds float, float* %x, i64 23
+ %23 = load float, float* %arrayidx.23, align 4
+ %add.23 = fadd fast float %23, %add.22
+ %arrayidx.24 = getelementptr inbounds float, float* %x, i64 24
+ %24 = load float, float* %arrayidx.24, align 4
+ %add.24 = fadd fast float %24, %add.23
+ %arrayidx.25 = getelementptr inbounds float, float* %x, i64 25
+ %25 = load float, float* %arrayidx.25, align 4
+ %add.25 = fadd fast float %25, %add.24
+ %arrayidx.26 = getelementptr inbounds float, float* %x, i64 26
+ %26 = load float, float* %arrayidx.26, align 4
+ %add.26 = fadd fast float %26, %add.25
+ %arrayidx.27 = getelementptr inbounds float, float* %x, i64 27
+ %27 = load float, float* %arrayidx.27, align 4
+ %add.27 = fadd fast float %27, %add.26
+ %arrayidx.28 = getelementptr inbounds float, float* %x, i64 28
+ %28 = load float, float* %arrayidx.28, align 4
+ %add.28 = fadd fast float %28, %add.27
+ %arrayidx.29 = getelementptr inbounds float, float* %x, i64 29
+ %29 = load float, float* %arrayidx.29, align 4
+ %add.29 = fadd fast float %29, %add.28
+ %arrayidx.30 = getelementptr inbounds float, float* %x, i64 30
+ %30 = load float, float* %arrayidx.30, align 4
+ %add.30 = fadd fast float %30, %add.29
+ %arrayidx.31 = getelementptr inbounds float, float* %x, i64 31
+ %31 = load float, float* %arrayidx.31, align 4
+ %add.31 = fadd fast float %31, %add.30
+ %arrayidx.32 = getelementptr inbounds float, float* %x, i64 32
+ %32 = load float, float* %arrayidx.32, align 4
+ %add.32 = fadd fast float %32, %add.31
+ %arrayidx.33 = getelementptr inbounds float, float* %x, i64 33
+ %33 = load float, float* %arrayidx.33, align 4
+ %add.33 = fadd fast float %33, %add.32
+ %arrayidx.34 = getelementptr inbounds float, float* %x, i64 34
+ %34 = load float, float* %arrayidx.34, align 4
+ %add.34 = fadd fast float %34, %add.33
+ %arrayidx.35 = getelementptr inbounds float, float* %x, i64 35
+ %35 = load float, float* %arrayidx.35, align 4
+ %add.35 = fadd fast float %35, %add.34
+ %arrayidx.36 = getelementptr inbounds float, float* %x, i64 36
+ %36 = load float, float* %arrayidx.36, align 4
+ %add.36 = fadd fast float %36, %add.35
+ %arrayidx.37 = getelementptr inbounds float, float* %x, i64 37
+ %37 = load float, float* %arrayidx.37, align 4
+ %add.37 = fadd fast float %37, %add.36
+ %arrayidx.38 = getelementptr inbounds float, float* %x, i64 38
+ %38 = load float, float* %arrayidx.38, align 4
+ %add.38 = fadd fast float %38, %add.37
+ %arrayidx.39 = getelementptr inbounds float, float* %x, i64 39
+ %39 = load float, float* %arrayidx.39, align 4
+ %add.39 = fadd fast float %39, %add.38
+ %arrayidx.40 = getelementptr inbounds float, float* %x, i64 40
+ %40 = load float, float* %arrayidx.40, align 4
+ %add.40 = fadd fast float %40, %add.39
+ %arrayidx.41 = getelementptr inbounds float, float* %x, i64 41
+ %41 = load float, float* %arrayidx.41, align 4
+ %add.41 = fadd fast float %41, %add.40
+ %arrayidx.42 = getelementptr inbounds float, float* %x, i64 42
+ %42 = load float, float* %arrayidx.42, align 4
+ %add.42 = fadd fast float %42, %add.41
+ %arrayidx.43 = getelementptr inbounds float, float* %x, i64 43
+ %43 = load float, float* %arrayidx.43, align 4
+ %add.43 = fadd fast float %43, %add.42
+ %arrayidx.44 = getelementptr inbounds float, float* %x, i64 44
+ %44 = load float, float* %arrayidx.44, align 4
+ %add.44 = fadd fast float %44, %add.43
+ %arrayidx.45 = getelementptr inbounds float, float* %x, i64 45
+ %45 = load float, float* %arrayidx.45, align 4
+ %add.45 = fadd fast float %45, %add.44
+ %arrayidx.46 = getelementptr inbounds float, float* %x, i64 46
+ %46 = load float, float* %arrayidx.46, align 4
+ %add.46 = fadd fast float %46, %add.45
+ %arrayidx.47 = getelementptr inbounds float, float* %x, i64 47
+ %47 = load float, float* %arrayidx.47, align 4
+ %add.47 = fadd fast float %47, %add.46
+ ret float %add.47
+}
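[Note: the RDX_SHUF/BIN_RDX lines checked above encode the vectorizer's log2 shuffle reduction: each round folds the upper half of the vector onto the lower half. A minimal hand-written sketch of the same pattern for a hypothetical <4 x float> value %v follows; all names here are invented for illustration and are not from the test:
  %s1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %r1 = fadd fast <4 x float> %v, %s1
  %s2 = shufflevector <4 x float> %r1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r2 = fadd fast <4 x float> %r1, %s2
  %sum = extractelement <4 x float> %r2, i32 0   ; element 0 now holds the total
The undef lanes in the shuffle masks are the lanes whose values no longer matter after each folding step.]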
+
+define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; CHECK-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; CHECK-NEXT: ret float [[BIN_EXTRA]]
+;
+; THRESHOLD-LABEL: @f1(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float
+; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; THRESHOLD-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; THRESHOLD-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; THRESHOLD-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; THRESHOLD-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; THRESHOLD-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
+; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; THRESHOLD-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; THRESHOLD-NEXT: ret float [[BIN_EXTRA]]
+;
+ entry:
+ %rem = srem i32 %a, %b
+ %conv = sitofp i32 %rem to float
+ %0 = load float, float* %x, align 4
+ %add = fadd fast float %0, %conv
+ %arrayidx.1 = getelementptr inbounds float, float* %x, i64 1
+ %1 = load float, float* %arrayidx.1, align 4
+ %add.1 = fadd fast float %1, %add
+ %arrayidx.2 = getelementptr inbounds float, float* %x, i64 2
+ %2 = load float, float* %arrayidx.2, align 4
+ %add.2 = fadd fast float %2, %add.1
+ %arrayidx.3 = getelementptr inbounds float, float* %x, i64 3
+ %3 = load float, float* %arrayidx.3, align 4
+ %add.3 = fadd fast float %3, %add.2
+ %arrayidx.4 = getelementptr inbounds float, float* %x, i64 4
+ %4 = load float, float* %arrayidx.4, align 4
+ %add.4 = fadd fast float %4, %add.3
+ %arrayidx.5 = getelementptr inbounds float, float* %x, i64 5
+ %5 = load float, float* %arrayidx.5, align 4
+ %add.5 = fadd fast float %5, %add.4
+ %arrayidx.6 = getelementptr inbounds float, float* %x, i64 6
+ %6 = load float, float* %arrayidx.6, align 4
+ %add.6 = fadd fast float %6, %add.5
+ %arrayidx.7 = getelementptr inbounds float, float* %x, i64 7
+ %7 = load float, float* %arrayidx.7, align 4
+ %add.7 = fadd fast float %7, %add.6
+ %arrayidx.8 = getelementptr inbounds float, float* %x, i64 8
+ %8 = load float, float* %arrayidx.8, align 4
+ %add.8 = fadd fast float %8, %add.7
+ %arrayidx.9 = getelementptr inbounds float, float* %x, i64 9
+ %9 = load float, float* %arrayidx.9, align 4
+ %add.9 = fadd fast float %9, %add.8
+ %arrayidx.10 = getelementptr inbounds float, float* %x, i64 10
+ %10 = load float, float* %arrayidx.10, align 4
+ %add.10 = fadd fast float %10, %add.9
+ %arrayidx.11 = getelementptr inbounds float, float* %x, i64 11
+ %11 = load float, float* %arrayidx.11, align 4
+ %add.11 = fadd fast float %11, %add.10
+ %arrayidx.12 = getelementptr inbounds float, float* %x, i64 12
+ %12 = load float, float* %arrayidx.12, align 4
+ %add.12 = fadd fast float %12, %add.11
+ %arrayidx.13 = getelementptr inbounds float, float* %x, i64 13
+ %13 = load float, float* %arrayidx.13, align 4
+ %add.13 = fadd fast float %13, %add.12
+ %arrayidx.14 = getelementptr inbounds float, float* %x, i64 14
+ %14 = load float, float* %arrayidx.14, align 4
+ %add.14 = fadd fast float %14, %add.13
+ %arrayidx.15 = getelementptr inbounds float, float* %x, i64 15
+ %15 = load float, float* %arrayidx.15, align 4
+ %add.15 = fadd fast float %15, %add.14
+ %arrayidx.16 = getelementptr inbounds float, float* %x, i64 16
+ %16 = load float, float* %arrayidx.16, align 4
+ %add.16 = fadd fast float %16, %add.15
+ %arrayidx.17 = getelementptr inbounds float, float* %x, i64 17
+ %17 = load float, float* %arrayidx.17, align 4
+ %add.17 = fadd fast float %17, %add.16
+ %arrayidx.18 = getelementptr inbounds float, float* %x, i64 18
+ %18 = load float, float* %arrayidx.18, align 4
+ %add.18 = fadd fast float %18, %add.17
+ %arrayidx.19 = getelementptr inbounds float, float* %x, i64 19
+ %19 = load float, float* %arrayidx.19, align 4
+ %add.19 = fadd fast float %19, %add.18
+ %arrayidx.20 = getelementptr inbounds float, float* %x, i64 20
+ %20 = load float, float* %arrayidx.20, align 4
+ %add.20 = fadd fast float %20, %add.19
+ %arrayidx.21 = getelementptr inbounds float, float* %x, i64 21
+ %21 = load float, float* %arrayidx.21, align 4
+ %add.21 = fadd fast float %21, %add.20
+ %arrayidx.22 = getelementptr inbounds float, float* %x, i64 22
+ %22 = load float, float* %arrayidx.22, align 4
+ %add.22 = fadd fast float %22, %add.21
+ %arrayidx.23 = getelementptr inbounds float, float* %x, i64 23
+ %23 = load float, float* %arrayidx.23, align 4
+ %add.23 = fadd fast float %23, %add.22
+ %arrayidx.24 = getelementptr inbounds float, float* %x, i64 24
+ %24 = load float, float* %arrayidx.24, align 4
+ %add.24 = fadd fast float %24, %add.23
+ %arrayidx.25 = getelementptr inbounds float, float* %x, i64 25
+ %25 = load float, float* %arrayidx.25, align 4
+ %add.25 = fadd fast float %25, %add.24
+ %arrayidx.26 = getelementptr inbounds float, float* %x, i64 26
+ %26 = load float, float* %arrayidx.26, align 4
+ %add.26 = fadd fast float %26, %add.25
+ %arrayidx.27 = getelementptr inbounds float, float* %x, i64 27
+ %27 = load float, float* %arrayidx.27, align 4
+ %add.27 = fadd fast float %27, %add.26
+ %arrayidx.28 = getelementptr inbounds float, float* %x, i64 28
+ %28 = load float, float* %arrayidx.28, align 4
+ %add.28 = fadd fast float %28, %add.27
+ %arrayidx.29 = getelementptr inbounds float, float* %x, i64 29
+ %29 = load float, float* %arrayidx.29, align 4
+ %add.29 = fadd fast float %29, %add.28
+ %arrayidx.30 = getelementptr inbounds float, float* %x, i64 30
+ %30 = load float, float* %arrayidx.30, align 4
+ %add.30 = fadd fast float %30, %add.29
+ %arrayidx.31 = getelementptr inbounds float, float* %x, i64 31
+ %31 = load float, float* %arrayidx.31, align 4
+ %add.31 = fadd fast float %31, %add.30
+ ret float %add.31
+}
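[Note: @f1 above feeds an extra scalar operand, %conv from the srem result, into the reduction chain. The checks expect it to stay scalar and be folded back in after the horizontal reduction (the BIN_EXTRA value) rather than be widened. Roughly, with the same caveat that these names are invented for illustration:
  %t = extractelement <32 x float> %rdx8, i32 0
  %res = fadd fast float %t, %conv   ; extra argument re-applied once after the reduction]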
+
+define float @loadadd31(float* nocapture readonly %x) {
+; CHECK-LABEL: @loadadd31(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]]
+; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0
+; CHECK-NEXT: [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]]
+; CHECK-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0
+; CHECK-NEXT: [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
+; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; CHECK-NEXT: ret float [[TMP12]]
+;
+; THRESHOLD-LABEL: @loadadd31(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
+; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
+; THRESHOLD-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; THRESHOLD-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; THRESHOLD-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; THRESHOLD-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; THRESHOLD-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; THRESHOLD-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; THRESHOLD-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
+; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
+; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]]
+; THRESHOLD-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0
+; THRESHOLD-NEXT: [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; THRESHOLD-NEXT: [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]]
+; THRESHOLD-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]]
+; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0
+; THRESHOLD-NEXT: [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]]
+; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
+; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; THRESHOLD-NEXT: ret float [[TMP12]]
+;
+ entry:
+ %arrayidx = getelementptr inbounds float, float* %x, i64 1
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx.1 = getelementptr inbounds float, float* %x, i64 2
+ %1 = load float, float* %arrayidx.1, align 4
+ %add.1 = fadd fast float %1, %0
+ %arrayidx.2 = getelementptr inbounds float, float* %x, i64 3
+ %2 = load float, float* %arrayidx.2, align 4
+ %add.2 = fadd fast float %2, %add.1
+ %arrayidx.3 = getelementptr inbounds float, float* %x, i64 4
+ %3 = load float, float* %arrayidx.3, align 4
+ %add.3 = fadd fast float %3, %add.2
+ %arrayidx.4 = getelementptr inbounds float, float* %x, i64 5
+ %4 = load float, float* %arrayidx.4, align 4
+ %add.4 = fadd fast float %4, %add.3
+ %arrayidx.5 = getelementptr inbounds float, float* %x, i64 6
+ %5 = load float, float* %arrayidx.5, align 4
+ %add.5 = fadd fast float %5, %add.4
+ %arrayidx.6 = getelementptr inbounds float, float* %x, i64 7
+ %6 = load float, float* %arrayidx.6, align 4
+ %add.6 = fadd fast float %6, %add.5
+ %arrayidx.7 = getelementptr inbounds float, float* %x, i64 8
+ %7 = load float, float* %arrayidx.7, align 4
+ %add.7 = fadd fast float %7, %add.6
+ %arrayidx.8 = getelementptr inbounds float, float* %x, i64 9
+ %8 = load float, float* %arrayidx.8, align 4
+ %add.8 = fadd fast float %8, %add.7
+ %arrayidx.9 = getelementptr inbounds float, float* %x, i64 10
+ %9 = load float, float* %arrayidx.9, align 4
+ %add.9 = fadd fast float %9, %add.8
+ %arrayidx.10 = getelementptr inbounds float, float* %x, i64 11
+ %10 = load float, float* %arrayidx.10, align 4
+ %add.10 = fadd fast float %10, %add.9
+ %arrayidx.11 = getelementptr inbounds float, float* %x, i64 12
+ %11 = load float, float* %arrayidx.11, align 4
+ %add.11 = fadd fast float %11, %add.10
+ %arrayidx.12 = getelementptr inbounds float, float* %x, i64 13
+ %12 = load float, float* %arrayidx.12, align 4
+ %add.12 = fadd fast float %12, %add.11
+ %arrayidx.13 = getelementptr inbounds float, float* %x, i64 14
+ %13 = load float, float* %arrayidx.13, align 4
+ %add.13 = fadd fast float %13, %add.12
+ %arrayidx.14 = getelementptr inbounds float, float* %x, i64 15
+ %14 = load float, float* %arrayidx.14, align 4
+ %add.14 = fadd fast float %14, %add.13
+ %arrayidx.15 = getelementptr inbounds float, float* %x, i64 16
+ %15 = load float, float* %arrayidx.15, align 4
+ %add.15 = fadd fast float %15, %add.14
+ %arrayidx.16 = getelementptr inbounds float, float* %x, i64 17
+ %16 = load float, float* %arrayidx.16, align 4
+ %add.16 = fadd fast float %16, %add.15
+ %arrayidx.17 = getelementptr inbounds float, float* %x, i64 18
+ %17 = load float, float* %arrayidx.17, align 4
+ %add.17 = fadd fast float %17, %add.16
+ %arrayidx.18 = getelementptr inbounds float, float* %x, i64 19
+ %18 = load float, float* %arrayidx.18, align 4
+ %add.18 = fadd fast float %18, %add.17
+ %arrayidx.19 = getelementptr inbounds float, float* %x, i64 20
+ %19 = load float, float* %arrayidx.19, align 4
+ %add.19 = fadd fast float %19, %add.18
+ %arrayidx.20 = getelementptr inbounds float, float* %x, i64 21
+ %20 = load float, float* %arrayidx.20, align 4
+ %add.20 = fadd fast float %20, %add.19
+ %arrayidx.21 = getelementptr inbounds float, float* %x, i64 22
+ %21 = load float, float* %arrayidx.21, align 4
+ %add.21 = fadd fast float %21, %add.20
+ %arrayidx.22 = getelementptr inbounds float, float* %x, i64 23
+ %22 = load float, float* %arrayidx.22, align 4
+ %add.22 = fadd fast float %22, %add.21
+ %arrayidx.23 = getelementptr inbounds float, float* %x, i64 24
+ %23 = load float, float* %arrayidx.23, align 4
+ %add.23 = fadd fast float %23, %add.22
+ %arrayidx.24 = getelementptr inbounds float, float* %x, i64 25
+ %24 = load float, float* %arrayidx.24, align 4
+ %add.24 = fadd fast float %24, %add.23
+ %arrayidx.25 = getelementptr inbounds float, float* %x, i64 26
+ %25 = load float, float* %arrayidx.25, align 4
+ %add.25 = fadd fast float %25, %add.24
+ %arrayidx.26 = getelementptr inbounds float, float* %x, i64 27
+ %26 = load float, float* %arrayidx.26, align 4
+ %add.26 = fadd fast float %26, %add.25
+ %arrayidx.27 = getelementptr inbounds float, float* %x, i64 28
+ %27 = load float, float* %arrayidx.27, align 4
+ %add.27 = fadd fast float %27, %add.26
+ %arrayidx.28 = getelementptr inbounds float, float* %x, i64 29
+ %28 = load float, float* %arrayidx.28, align 4
+ %add.28 = fadd fast float %28, %add.27
+ %arrayidx.29 = getelementptr inbounds float, float* %x, i64 30
+ %29 = load float, float* %arrayidx.29, align 4
+ %add.29 = fadd fast float %29, %add.28
+ ret float %add.29
+}
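[Note: @loadadd31 starts its chain at x[1] with 31 elements, so no single power-of-two vector covers the whole range; the checks expect <16 x float>, <8 x float> and <4 x float> blocks plus two scalar loads, each block reduced separately and the partial sums combined at the end. A sketch of that final combine, with invented partial-sum names:
  %p = fadd fast float %sum16, %sum8
  %q = fadd fast float %p, %sum4
  %r = fadd fast float %q, %x2   ; scalar load of x[2]
  %res = fadd fast float %r, %x1 ; scalar load of x[1]]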
+
+define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
+; CHECK-LABEL: @extra_args(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; CHECK-NEXT: ret float [[BIN_EXTRA5]]
+;
+; THRESHOLD-LABEL: @extra_args(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
+; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]]
+;
+ entry:
+ %mul = mul nsw i32 %b, %a
+ %conv = sitofp i32 %mul to float
+ %0 = load float, float* %x, align 4
+ %add = fadd fast float %conv, 3.000000e+00
+ %add1 = fadd fast float %0, %add
+ %arrayidx3 = getelementptr inbounds float, float* %x, i64 1
+ %1 = load float, float* %arrayidx3, align 4
+ %add4 = fadd fast float %1, %add1
+ %add5 = fadd fast float %add4, %conv
+ %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2
+ %2 = load float, float* %arrayidx3.1, align 4
+ %add4.1 = fadd fast float %2, %add5
+ %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3
+ %3 = load float, float* %arrayidx3.2, align 4
+ %add4.2 = fadd fast float %3, %add4.1
+ %arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4
+ %4 = load float, float* %arrayidx3.3, align 4
+ %add4.3 = fadd fast float %4, %add4.2
+ %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5
+ %5 = load float, float* %arrayidx3.4, align 4
+ %add4.4 = fadd fast float %5, %add4.3
+ %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6
+ %6 = load float, float* %arrayidx3.5, align 4
+ %add4.5 = fadd fast float %6, %add4.4
+ %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7
+ %7 = load float, float* %arrayidx3.6, align 4
+ %add4.6 = fadd fast float %7, %add4.5
+ ret float %add4.6
+}
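[Note: in @extra_args the start value %add (= %conv + 3.0) and a second use of %conv both sit outside the load chain, and the checks expect each to be re-applied as a scalar fadd after the reduction (BIN_EXTRA, BIN_EXTRA5). The next test additionally repeats the constant 5.0; each extra occurrence appears to get its own scalar fadd, roughly (invented names):
  %e1 = fadd fast float %rdx0, %start
  %e2 = fadd fast float %e1, 5.000000e+00
  %e3 = fadd fast float %e2, 5.000000e+00
  %res = fadd fast float %e3, %conv]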
float, float* [[X:%.*]], i64 1 +; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 +; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 +; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 +; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 +; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 +; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 +; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]] +; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]] +; THRESHOLD-NEXT: [[ADD41:%.*]] = fadd fast float [[ADD4]], 5.000000e+00 +; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD41]], [[CONV]] +; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]] +; THRESHOLD-NEXT: [[ADD4_11:%.*]] = fadd fast float [[ADD4_1]], 5.000000e+00 +; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_11]] +; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]] +; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]] +; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00 +; THRESHOLD-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00 +; THRESHOLD-NEXT: [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]] +; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] +; THRESHOLD-NEXT: ret float [[BIN_EXTRA7]] +; + entry: + %mul = mul nsw i32 %b, %a + %conv = sitofp i32 %mul to float + %0 = load float, float* %x, align 4 + %add = fadd fast float %conv, 3.000000e+00 + %add1 = fadd fast float %0, %add + %arrayidx3 = getelementptr inbounds float, float* %x, i64 1 + %1 = load float, float* %arrayidx3, align 4 + %add4 = fadd fast float %1, %add1 + %add41 = fadd fast float %add4, 5.000000e+00 + %add5 = fadd fast float %add41, %conv + %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2 + %2 = load float, float* %arrayidx3.1, align 4 + %add4.1 = fadd fast float %2, %add5 + %add4.11 = fadd fast float %add4.1, 5.000000e+00 + %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3 + %3 = load float, float* %arrayidx3.2, align 4 + %add4.2 = fadd fast float %3, %add4.11 + 
%arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4 + %4 = load float, float* %arrayidx3.3, align 4 + %add4.3 = fadd fast float %4, %add4.2 + %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5 + %5 = load float, float* %arrayidx3.4, align 4 + %add4.4 = fadd fast float %5, %add4.3 + %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6 + %6 = load float, float* %arrayidx3.5, align 4 + %add4.5 = fadd fast float %6, %add4.4 + %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7 + %7 = load float, float* %arrayidx3.6, align 4 + %add4.6 = fadd fast float %7, %add4.5 + ret float %add4.6 +} + +define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @extra_args_no_replace( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; CHECK-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float +; CHECK-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 +; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 +; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 +; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]] +; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]] +; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]] +; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]] +; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]] +; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]] +; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]] +; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] +; CHECK-NEXT: ret float [[BIN_EXTRA5]] +; +; THRESHOLD-LABEL: @extra_args_no_replace( +; 
THRESHOLD-NEXT: entry: +; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] +; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float +; THRESHOLD-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00 +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]] +; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 +; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 +; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 +; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 +; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 +; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 +; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 +; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]] +; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]] +; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]] +; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]] +; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]] +; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]] +; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]] +; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]] +; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 +; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]] +; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] +; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]] +; + entry: + %mul = mul nsw i32 %b, %a + %conv = sitofp i32 %mul to float + %0 = load float, float* %x, align 4 + %convc = sitofp i32 %c to float + %addc = fadd fast float %convc, 3.000000e+00 + %add = fadd fast float %conv, %addc + %add1 = fadd fast float %0, %add + %arrayidx3 = getelementptr inbounds float, float* %x, i64 1 + %1 = load float, float* %arrayidx3, align 4 + %add4 = fadd fast float %1, %add1 + %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2 + %2 = load float, float* %arrayidx3.1, align 4 + %add4.1 = fadd fast float %2, %add4 + %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3 + %3 = load float, float* %arrayidx3.2, align 4 + %add4.2 = fadd 
fast float %3, %add4.1 + %arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4 + %4 = load float, float* %arrayidx3.3, align 4 + %add4.3 = fadd fast float %4, %add4.2 + %add5 = fadd fast float %add4.3, %conv + %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5 + %5 = load float, float* %arrayidx3.4, align 4 + %add4.4 = fadd fast float %5, %add5 + %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6 + %6 = load float, float* %arrayidx3.5, align 4 + %add4.5 = fadd fast float %6, %add4.4 + %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7 + %7 = load float, float* %arrayidx3.6, align 4 + %add4.6 = fadd fast float %7, %add4.5 + ret float %add4.6 +} + +define i32 @wobble(i32 %arg, i32 %bar) { +; CHECK-LABEL: @wobble( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[R1:%.*]] = add nuw i32 [[ARG]], undef +; CHECK-NEXT: [[R2:%.*]] = add nsw i32 [[R1]], undef +; CHECK-NEXT: [[R3:%.*]] = add nsw i32 [[R2]], undef +; CHECK-NEXT: [[R4:%.*]] = add nsw i32 [[R3]], undef +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] +; CHECK-NEXT: [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]] +; CHECK-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], undef +; CHECK-NEXT: ret i32 [[BIN_EXTRA3]] +; +; THRESHOLD-LABEL: @wobble( +; THRESHOLD-NEXT: bb: +; THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0 +; THRESHOLD-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1 +; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2 +; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3 +; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0 +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1 +; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2 +; THRESHOLD-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3 +; THRESHOLD-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], 
i32 3 +; THRESHOLD-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer +; THRESHOLD-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> +; THRESHOLD-NEXT: [[R1:%.*]] = add nuw i32 [[ARG]], undef +; THRESHOLD-NEXT: [[R2:%.*]] = add nsw i32 [[R1]], undef +; THRESHOLD-NEXT: [[R3:%.*]] = add nsw i32 [[R2]], undef +; THRESHOLD-NEXT: [[R4:%.*]] = add nsw i32 [[R3]], undef +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] +; THRESHOLD-NEXT: [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]] +; THRESHOLD-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], undef +; THRESHOLD-NEXT: ret i32 [[BIN_EXTRA3]] +; + bb: + %x1 = xor i32 %arg, %bar + %i1 = icmp eq i32 %x1, 0 + %s1 = sext i1 %i1 to i32 + %x2 = xor i32 %arg, %bar + %i2 = icmp eq i32 %x2, 0 + %s2 = sext i1 %i2 to i32 + %x3 = xor i32 %arg, %bar + %i3 = icmp eq i32 %x3, 0 + %s3 = sext i1 %i3 to i32 + %x4 = xor i32 %arg, %bar + %i4 = icmp eq i32 %x4, 0 + %s4 = sext i1 %i4 to i32 + %r1 = add nuw i32 %arg, %s1 + %r2 = add nsw i32 %r1, %s2 + %r3 = add nsw i32 %r2, %s3 + %r4 = add nsw i32 %r3, %s4 + %r5 = add nsw i32 %r4, %x4 + ret i32 %r5 +} + diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll index f6efd26a4c20..080f850f91cf 100644 --- a/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -624,9 +624,9 @@ define void @i32_red_example4(i32* %res) { ; STORE-LABEL: @i32_red_example4( ; STORE: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> -; STORE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP0]], [[RDX_SHUF]] +; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> -; STORE-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 ; STORE: store i32 [[TMP1]], i32* %res, align 16 ; STORE-NEXT: ret void @@ -647,11 +647,11 @@ define void @i32_red_example8(i32* %res) { ; STORE-LABEL: @i32_red_example8( ; STORE: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> -; STORE-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP0]], [[RDX_SHUF]] +; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> -; STORE-NEXT: [[BIN_RDX2:%.*]] = add <8 x 
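The tests above (@extra_args, @extra_args_same_several_times, @extra_args_no_replace, @wobble) all pin down the same two-part lowering: the scalar chain is recognized as a horizontal reduction and emitted as log2(VF) shufflevector/op stages (the [[RDX_SHUF*]]/[[BIN_RDX*]] pairs), after which every operand that was mixed into the chain but is not one of the reduced lanes (%conv, the repeated 5.0, %arg) is re-applied to the extracted scalar as the [[BIN_EXTRA*]] instructions. A minimal hand-written sketch of that shape for four lanes (illustrative only; @reduce4_sketch is not part of the patch):

define float @reduce4_sketch(<4 x float> %v, float %extra) {
  ; stage 1: fold the high half onto the low half
  %s1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %r1 = fadd fast <4 x float> %v, %s1
  ; stage 2: fold lane 1 onto lane 0
  %s2 = shufflevector <4 x float> %r1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r2 = fadd fast <4 x float> %r1, %s2
  %red = extractelement <4 x float> %r2, i32 0
  ; the non-lane "extra arg" is re-added after the vector reduction
  %res = fadd fast float %red, %extra
  ret float %res
}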
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll
index f6efd26a4c20..080f850f91cf 100644
--- a/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -624,9 +624,9 @@ define void @i32_red_example4(i32* %res) {
 ; STORE-LABEL: @i32_red_example4(
 ; STORE: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
 ; STORE: store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT: ret void
@@ -647,11 +647,11 @@ define void @i32_red_example8(i32* %res) {
 ; STORE-LABEL: @i32_red_example8(
 ; STORE: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; STORE-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
 ; STORE: store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT: ret void
@@ -680,13 +680,13 @@ define void @i32_red_example16(i32* %res) {
 ; STORE-LABEL: @i32_red_example16(
 ; STORE: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <16 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = add <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX4:%.*]] = add <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX6:%.*]] = add <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; STORE-NEXT: [[BIN_RDX6:%.*]] = add nsw <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
 ; STORE-NEXT: [[TMP1:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
 ; STORE: store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT: ret void
@@ -731,15 +731,15 @@ define void @i32_red_example32(i32* %res) {
 ; STORE-LABEL: @i32_red_example32(
 ; STORE: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = add <32 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <32 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = add <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX4:%.*]] = add <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX6:%.*]] = add <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; STORE-NEXT: [[BIN_RDX6:%.*]] = add nsw <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
 ; STORE-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX8:%.*]] = add <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
+; STORE-NEXT: [[BIN_RDX8:%.*]] = add nsw <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
 ; STORE-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
 ; STORE: store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT: ret void
@@ -812,3 +812,98 @@ entry:
   ret void
 }
+declare i32 @foobar(i32)
+
+define void @i32_red_call(i32 %val) {
+; CHECK-LABEL: @i32_red_call(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[ADD_6]])
+; CHECK-NEXT: ret void
+;
+entry:
+  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+  %add = add nsw i32 %1, %0
+  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+  %add.1 = add nsw i32 %2, %add
+  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+  %add.2 = add nsw i32 %3, %add.1
+  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+  %add.3 = add nsw i32 %4, %add.2
+  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+  %add.4 = add nsw i32 %5, %add.3
+  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+  %add.5 = add nsw i32 %6, %add.4
+  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+  %add.6 = add nsw i32 %7, %add.5
+  %res = call i32 @foobar(i32 %add.6)
+  ret void
+}
+
+define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 {
+; CHECK-LABEL: @i32_red_invoke(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[ADD_6]])
+; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
+; CHECK: exception:
+; CHECK-NEXT: [[CLEANUP:%.*]] = landingpad i8
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: br label [[NORMAL]]
+; CHECK: normal:
+; CHECK-NEXT: ret void
+;
+entry:
+  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+  %add = add nsw i32 %1, %0
+  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+  %add.1 = add nsw i32 %2, %add
+  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+  %add.2 = add nsw i32 %3, %add.1
+  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+  %add.3 = add nsw i32 %4, %add.2
+  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+  %add.4 = add nsw i32 %5, %add.3
+  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+  %add.5 = add nsw i32 %6, %add.4
+  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+  %add.6 = add nsw i32 %7, %add.5
+  %res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception
+exception:
+  %cleanup = landingpad i8 cleanup
+  br label %normal
+normal:
+  ret void
+}
+
+declare i32 @__gxx_personality_v0(...)
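Two behaviors are locked down in the horizontal.ll hunks above: the generated [[BIN_RDX*]] vector adds are now expected to carry the nsw flag from the scalar chain, and the new @i32_red_call/@i32_red_invoke tests check that a reduction whose only use is a call or invoke operand is still matched. A reduced sketch of the latter shape (hand-written; @arr4 and @use are illustrative stand-ins, not names from the patch):

@arr4 = global [4 x i32] zeroinitializer
declare void @use(i32)

define void @red_into_call_sketch() {
  %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @arr4, i64 0, i64 0), align 4
  %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @arr4, i64 0, i64 1), align 4
  %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @arr4, i64 0, i64 2), align 4
  %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @arr4, i64 0, i64 3), align 4
  %a = add nsw i32 %1, %0
  %b = add nsw i32 %2, %a
  %c = add nsw i32 %3, %b
  ; the reduction's single use is a call operand, as in @i32_red_call
  call void @use(i32 %c)
  ret void
}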
diff --git a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
index 76b3b9174a51..47a6a44611d8 100644
--- a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -5,17 +5,17 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* %p, i64 1
-; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* %p, i64 2
-; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* %p, i64 3
-; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* %p, i64 4
-; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* %p, i64 5
-; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* %p, i64 6
-; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* %p, i64 7
-; CHECK-NEXT: br label %for.body
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* %p to <8 x i32>*
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, [[TMP1]]
 ; CHECK-NEXT: [[ADD:%.*]] = add i32 undef, [[SUM]]
@@ -32,10 +32,11 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[TMP3]], [[SUM]]
-; CHECK-NEXT: br i1 true, label %for.end, label %for.body
+; CHECK-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT: [[ADD_7:%.*]] = add i32 undef, [[ADD_6]]
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK: for.end:
-; CHECK-NEXT: ret i32 [[ADD_7]]
+; CHECK-NEXT: ret i32 [[BIN_EXTRA]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
diff --git a/test/Transforms/SLPVectorizer/X86/scheduling.ll b/test/Transforms/SLPVectorizer/X86/scheduling.ll
index 5377ee82cf97..c4f521c8963e 100644
--- a/test/Transforms/SLPVectorizer/X86/scheduling.ll
+++ b/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -8,11 +8,11 @@ define i32 @foo(i32* nocapture readonly %diff) #0 {
 ; CHECK: [[S1:%.+]] = add nsw <4 x i32>
 ; CHECK: store <4 x i32> [[S1]],
 ; CHECK: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[S1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[S1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[S1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[ADD52:%.*]] = add nsw i32 [[TMP15]],
+; CHECK: [[ADD52:%.*]] = add nsw i32 [[TMP15]],
 ; CHECK: ret i32 [[ADD52]]
 ;
 entry:
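In reduction_loads.ll the loop-carried phi is now fed by the vectorizer's [[BIN_EXTRA]] add (the real reduction result) rather than the dead scalar [[ADD_7]], and scheduling.ll relaxes a CHECK-NEXT to CHECK because leftover scalar instructions may now sit between the extract and [[ADD52]]. The loop-carried shape under test, in miniature (hand-written sketch, not a test from the patch):

define i32 @loop_red_sketch(i32* %p) {
entry:
  br label %for.body
for.body:
  ; each iteration's reduced value must feed the next iteration's phi
  %sum = phi i32 [ 0, %entry ], [ %next, %for.body ]
  %v = load i32, i32* %p, align 4
  %next = add i32 %v, %sum
  br i1 true, label %for.end, label %for.body
for.end:
  ret i32 %next
}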
diff --git a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
new file mode 100644
index 000000000000..1b2c76384e0b
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+
+
+define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
+; CHECK-LABEL: @jumbled-load(
+; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
+; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
+; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
+; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
+; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
+; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
+; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_9]], align 4
+; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_7]], align 4
+; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_10]], align 4
+; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_8]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+  %in.addr = getelementptr inbounds i32, i32* %in, i64 0
+  %load.1 = load i32, i32* %in.addr, align 4
+  %gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 1
+  %load.2 = load i32, i32* %gep.1, align 4
+  %gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 2
+  %load.3 = load i32, i32* %gep.2, align 4
+  %gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 3
+  %load.4 = load i32, i32* %gep.3, align 4
+  %inn.addr = getelementptr inbounds i32, i32* %inn, i64 0
+  %load.5 = load i32, i32* %inn.addr, align 4
+  %gep.4 = getelementptr inbounds i32, i32* %inn.addr, i64 1
+  %load.6 = load i32, i32* %gep.4, align 4
+  %gep.5 = getelementptr inbounds i32, i32* %inn.addr, i64 2
+  %load.7 = load i32, i32* %gep.5, align 4
+  %gep.6 = getelementptr inbounds i32, i32* %inn.addr, i64 3
+  %load.8 = load i32, i32* %gep.6, align 4
+  %mul.1 = mul i32 %load.1, %load.5
+  %mul.2 = mul i32 %load.2, %load.6
+  %mul.3 = mul i32 %load.3, %load.7
+  %mul.4 = mul i32 %load.4, %load.8
+  %gep.7 = getelementptr inbounds i32, i32* %out, i64 0
+  %gep.8 = getelementptr inbounds i32, i32* %out, i64 1
+  %gep.9 = getelementptr inbounds i32, i32* %out, i64 2
+  %gep.10 = getelementptr inbounds i32, i32* %out, i64 3
+  store i32 %mul.1, i32* %gep.9, align 4
+  store i32 %mul.2, i32* %gep.7, align 4
+  store i32 %mul.3, i32* %gep.10, align 4
+  store i32 %mul.4, i32* %gep.8, align 4
+
+  ret i32 undef
+}
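The new store-jumbled.ll test permutes its four stores (the results land at out[2], out[0], out[3], out[1]), and every CHECK line stays scalar: at this point SLP does not reorder a jumbled store bundle. The in-order counterpart, which does collapse into a single <4 x i32> load/mul/store, would look like this (illustrative sketch, not a test from the patch):

define void @in_order_sketch(i32* noalias %in, i32* noalias %inn, i32* noalias %out) {
  %gep.in.1 = getelementptr inbounds i32, i32* %in, i64 1
  %gep.in.2 = getelementptr inbounds i32, i32* %in, i64 2
  %gep.in.3 = getelementptr inbounds i32, i32* %in, i64 3
  %a0 = load i32, i32* %in, align 4
  %a1 = load i32, i32* %gep.in.1, align 4
  %a2 = load i32, i32* %gep.in.2, align 4
  %a3 = load i32, i32* %gep.in.3, align 4
  %gep.inn.1 = getelementptr inbounds i32, i32* %inn, i64 1
  %gep.inn.2 = getelementptr inbounds i32, i32* %inn, i64 2
  %gep.inn.3 = getelementptr inbounds i32, i32* %inn, i64 3
  %b0 = load i32, i32* %inn, align 4
  %b1 = load i32, i32* %gep.inn.1, align 4
  %b2 = load i32, i32* %gep.inn.2, align 4
  %b3 = load i32, i32* %gep.inn.3, align 4
  %m0 = mul i32 %a0, %b0
  %m1 = mul i32 %a1, %b1
  %m2 = mul i32 %a2, %b2
  %m3 = mul i32 %a3, %b3
  %gep.out.1 = getelementptr inbounds i32, i32* %out, i64 1
  %gep.out.2 = getelementptr inbounds i32, i32* %out, i64 2
  %gep.out.3 = getelementptr inbounds i32, i32* %out, i64 3
  ; stores hit out[0..3] in order, so the whole group can become one vector store
  store i32 %m0, i32* %out, align 4
  store i32 %m1, i32* %gep.out.1, align 4
  store i32 %m2, i32* %gep.out.2, align 4
  store i32 %m3, i32* %gep.out.3, align 4
  ret void
}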
diff --git a/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
new file mode 100644
index 000000000000..2b593b78652f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -0,0 +1,984 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+define void @add0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @add0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> <i32 1, i32 1, i32 2, i32 3>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %add = add nsw i32 %0, 1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %add, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %add3 = add nsw i32 %1, 1
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %add3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %add6 = add nsw i32 %2, 2
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %add6, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %add9 = add nsw i32 %3, 3
+  store i32 %add9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @add1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @add1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
+; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %0, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %add3 = add nsw i32 %1, 1
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %add3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %add6 = add nsw i32 %2, 2
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %add6, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %add9 = add nsw i32 %3, 3
+  store i32 %add9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
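The pairing of @add0 and @add1 (and the pairs that follow) is the point of this new file: in @add0 every lane performs an add nsw, just with different constants (1, 1, 2, 3), so the four lanes collapse into one <4 x i32> add; in @add1 lane 0 is a bare copy with no binop at all, and the checks record that the bundle currently stays scalar. Spelled with an identity add, the copy lane would have the uniform opcode the vectorizer wants (illustrative sketch, not a test from the patch):

define void @add1_identity_sketch(i32* noalias %dst, i32* noalias %src) {
entry:
  %0 = load i32, i32* %src, align 4
  ; the copy lane of @add1, written as "add nsw 0" so all four lanes share one opcode
  %add0 = add nsw i32 %0, 0
  store i32 %add0, i32* %dst, align 4
  %src1 = getelementptr inbounds i32, i32* %src, i64 1
  %dst1 = getelementptr inbounds i32, i32* %dst, i64 1
  %1 = load i32, i32* %src1, align 4
  %add1 = add nsw i32 %1, 1
  store i32 %add1, i32* %dst1, align 4
  %src2 = getelementptr inbounds i32, i32* %src, i64 2
  %dst2 = getelementptr inbounds i32, i32* %dst, i64 2
  %2 = load i32, i32* %src2, align 4
  %add2 = add nsw i32 %2, 2
  store i32 %add2, i32* %dst2, align 4
  %src3 = getelementptr inbounds i32, i32* %src, i64 3
  %dst3 = getelementptr inbounds i32, i32* %dst, i64 3
  %3 = load i32, i32* %src3, align 4
  %add3 = add nsw i32 %3, 3
  store i32 %add3, i32* %dst3, align 4
  ret void
}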
+define void @sub0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @sub0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %sub = add nsw i32 %0, -1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %sub, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %1, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %sub5 = add nsw i32 %2, -2
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %sub5, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %sub8 = add nsw i32 %3, -3
+  store i32 %sub8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @sub1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @sub1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> <i32 4, i32 -1, i32 -2, i32 -3>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %add = add nsw i32 %0, 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %add, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %sub = add nsw i32 %1, -1
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %sub, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %sub5 = add nsw i32 %2, -2
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %sub5, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %sub8 = add nsw i32 %3, -3
+  store i32 %sub8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @sub2(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @sub2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> <i32 -1, i32 -1, i32 -2, i32 -3>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %sub = add nsw i32 %0, -1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %sub, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %sub3 = add nsw i32 %1, -1
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %sub3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %sub6 = add nsw i32 %2, -2
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %sub6, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %sub9 = add nsw i32 %3, -3
+  store i32 %sub9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @addsub0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @addsub0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %sub = add nsw i32 %0, -1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %sub, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %1, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %sub5 = add nsw i32 %2, -2
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %sub5, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %sub8 = sub nsw i32 %3, -3
+  store i32 %sub8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @addsub1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @addsub1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %sub = add nsw i32 %0, -1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %sub, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %sub1 = sub nsw i32 %1, -1
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %sub1, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %2, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %sub8 = sub nsw i32 %3, -3
+  store i32 %sub8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @mul(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %mul = mul nsw i32 %0, 257
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %mul, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %mul3 = mul nsw i32 %1, -3
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %mul3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %2, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %mul9 = mul nsw i32 %3, -9
+  store i32 %mul9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @shl0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @shl0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
+; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %0, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %shl = shl i32 %1, 1
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %shl, i32* %incdec.ptr1, align 4
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %shl5 = shl i32 %2, 2
+  %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %shl5, i32* %incdec.ptr3, align 4
+  %3 = load i32, i32* %incdec.ptr4, align 4
+  %shl8 = shl i32 %3, 3
+  store i32 %shl8, i32* %incdec.ptr6, align 4
+  ret void
+}
+
+define void @shl1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @shl1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 7, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+  %0 = load i32, i32* %src, align 4
+  %shl = shl i32 %0, 7
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+  store i32 %shl, i32* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %shl3 = shl i32 %1, 1
+  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+  store i32 %shl3, i32* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+  %2 = load i32, i32* %incdec.ptr2, align 4
+  %shl6 = shl i32 %2, 2
+  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+  store i32 %shl6, i32* %incdec.ptr4, align 4
+  %3 = load i32, i32* %incdec.ptr5, align 4
+  %shl9 = shl i32 %3, 3
+  store i32 %shl9, i32* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @add0f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add0f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %add = fadd fast float %0, 1.000000e+00
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %add, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %add3 = fadd fast float %1, 1.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %add3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %add6 = fadd fast float %2, 2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %add6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %add9 = fadd fast float %3, 3.000000e+00
+  store float %add9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @add1f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add1f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+  %0 = load float, float* %src, align 4
+  %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+  store float %0, float* %dst, align 4
+  %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+  %1 = load float, float* %incdec.ptr, align 4
+  %add3 = fadd fast float %1, 1.000000e+00
+  %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+  store float %add3, float* %incdec.ptr1, align 4
+  %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+  %2 = load float, float* %incdec.ptr2, align 4
+  %add6 = fadd fast float %2, 2.000000e+00
+  %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+  store float %add6, float* %incdec.ptr4, align 4
+  %3 = load float, float* %incdec.ptr5, align 4
+  %add9 = fadd fast float %3, 3.000000e+00
+  store float %add9, float* %incdec.ptr7, align 4
+  ret void
+}
+
+define void @sub0f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub0f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds
float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 +; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 +; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00 +; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: ret void +; +entry: + %incdec.ptr = getelementptr inbounds float, float* %src, i64 1 + %0 = load float, float* %src, align 4 + %add = fadd fast float %0, -1.000000e+00 + %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1 + store float %add, float* %dst, align 4 + %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2 + %1 = load float, float* %incdec.ptr, align 4 + %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2 + store float %1, float* %incdec.ptr1, align 4 + %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3 + %2 = load float, float* %incdec.ptr2, align 4 + %add6 = fadd fast float %2, -2.000000e+00 + %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3 + store float %add6, float* %incdec.ptr4, align 4 + %3 = load float, float* %incdec.ptr5, align 4 + %add9 = fadd fast float %3, -3.000000e+00 + store float %add9, float* %incdec.ptr7, align 4 + ret void +} + +define void @sub1f(float* noalias %dst, float* noalias %src) { +; CHECK-LABEL: @sub1f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %incdec.ptr = getelementptr inbounds float, float* %src, i64 1 + %0 = load float, float* %src, align 4 + %add = fadd fast float %0, 4.000000e+00 + %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1 + store float %add, float* %dst, align 4 + %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2 + %1 = load float, float* %incdec.ptr, align 4 + %sub = fadd fast float %1, -1.000000e+00 + %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2 + store float %sub, float* %incdec.ptr1, align 4 + %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3 + %2 = load float, float* %incdec.ptr2, align 4 + %sub5 = fadd fast float %2, -2.000000e+00 + %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3 + store float %sub5, float* %incdec.ptr3, align 4 + %3 = load float, float* %incdec.ptr4, align 4 + %sub8 = fadd fast float %3, -3.000000e+00 + store float %sub8, float* %incdec.ptr6, align 
4 + ret void +} + +define void @sub2f(float* noalias %dst, float* noalias %src) { +; CHECK-LABEL: @sub2f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %incdec.ptr = getelementptr inbounds float, float* %src, i64 1 + %0 = load float, float* %src, align 4 + %sub = fadd fast float %0, -1.000000e+00 + %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1 + store float %sub, float* %dst, align 4 + %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2 + %1 = load float, float* %incdec.ptr, align 4 + %sub3 = fadd fast float %1, -1.000000e+00 + %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2 + store float %sub3, float* %incdec.ptr1, align 4 + %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3 + %2 = load float, float* %incdec.ptr2, align 4 + %sub6 = fadd fast float %2, -2.000000e+00 + %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3 + store float %sub6, float* %incdec.ptr4, align 4 + %3 = load float, float* %incdec.ptr5, align 4 + %sub9 = fadd fast float %3, -3.000000e+00 + store float %sub9, float* %incdec.ptr7, align 4 + ret void +} + +define void @addsub0f(float* noalias %dst, float* noalias %src) { +; CHECK-LABEL: @addsub0f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 +; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 +; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 +; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 +; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 +; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 +; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: ret void +; 
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fadd fast float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+ store float %1, float* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %sub5 = fadd fast float %2, -2.000000e+00
+ %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+ store float %sub5, float* %incdec.ptr3, align 4
+ %3 = load float, float* %incdec.ptr4, align 4
+ %sub8 = fsub fast float %3, -3.000000e+00
+ store float %sub8, float* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @addsub1f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @addsub1f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fadd fast float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub1 = fsub fast float %1, -1.000000e+00
+ %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub1, float* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+ store float %2, float* %incdec.ptr3, align 4
+ %3 = load float, float* %incdec.ptr4, align 4
+ %sub8 = fsub fast float %3, -3.000000e+00
+ store float %sub8, float* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @mulf(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @mulf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fmul fast float %0, 2.570000e+02
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub3 = fmul fast float %1, -3.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %2, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %sub9 = fmul fast float %3, -9.000000e+00
+ store float %sub9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @add0fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add0fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %add = fadd float %0, 1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %add, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %add3 = fadd float %1, 1.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %add3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %add6 = fadd float %2, 2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %add6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %add9 = fadd float %3, 3.000000e+00
+ store float %add9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @add1fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add1fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[TMP1]], 1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], 2.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %0, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %add3 = fadd float %1, 1.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %add3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %add6 = fadd float %2, 2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %add6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %add9 = fadd float %3, 3.000000e+00
+ store float %add9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @sub0fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub0fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %add = fadd fast float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %add, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %1, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %add6 = fadd float %2, -2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %add6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %add9 = fadd float %3, -3.000000e+00
+ store float %add9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @sub1fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub1fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %add = fadd float %0, 4.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %add, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub = fadd float %1, -1.000000e+00
+ %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub, float* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %sub5 = fadd float %2, -2.000000e+00
+ %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+ store float %sub5, float* %incdec.ptr3, align 4
+ %3 = load float, float* %incdec.ptr4, align 4
+ %sub8 = fadd float %3, -3.000000e+00
+ store float %sub8, float* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @sub2fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub2fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fadd float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub3 = fadd float %1, -1.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %sub6 = fadd float %2, -2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %sub6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %sub9 = fadd float %3, -3.000000e+00
+ store float %sub9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @mulfn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @mulfn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fmul float [[TMP0]], 2.570000e+02
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB3:%.*]] = fmul float [[TMP1]], -3.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fmul float %0, 2.570000e+02
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub3 = fmul float %1, -3.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %2, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %sub9 = fmul fast float %3, -9.000000e+00
+ store float %sub9, float* %incdec.ptr7, align 4
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/vector.ll b/test/Transforms/SLPVectorizer/X86/vector.ll
index 02a18979c659..e1f3fa50ccdb 100644
--- a/test/Transforms/SLPVectorizer/X86/vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/vector.ll
@@ -1,14 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 ; Make sure that we are not crashing or changing the code.
-;CHECK: test
-;CHECK: icmp
-;CHECK: ret
 define void @test(<4 x i32> %in, <4 x i32> %in2) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: [[K:%.*]] = icmp eq <4 x i32> [[IN:%.*]], [[IN2:%.*]]
+; CHECK-NEXT: ret void
+;
 %k = icmp eq <4 x i32> %in, %in2
 ret void
 }
+define i1 @cmpv2f32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @cmpv2f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT: [[Y0:%.*]] = extractelement <2 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[X0]], [[Y0]]
+; CHECK-NEXT: br i1 [[CMP0]], label [[IF:%.*]], label [[ENDIF:%.*]]
+; CHECK: if:
+; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X]], i32 1
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i32 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[X1]], [[Y1]]
+; CHECK-NEXT: br label [[ENDIF]]
+; CHECK: endif:
+; CHECK-NEXT: [[AND_OF_CMPS:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP1]], [[IF]] ]
+; CHECK-NEXT: ret i1 [[AND_OF_CMPS]]
+;
+ entry:
+ %x0 = extractelement <2 x i32> %x, i32 0
+ %y0 = extractelement <2 x i32> %y, i32 0
+ %cmp0 = icmp eq i32 %x0, %y0
+ br i1 %cmp0, label %if, label %endif
+
+ if:
+ %x1 = extractelement <2 x i32> %x, i32 1
+ %y1 = extractelement <2 x i32> %y, i32 1
+ %cmp1 = icmp eq i32 %x1, %y1
+ br label %endif
+
+ endif:
+ %and_of_cmps = phi i1 [ false, %entry ], [ %cmp1, %if ]
+ ret i1 %and_of_cmps
+}
+
diff --git a/test/Transforms/SROA/alloca-address-space.ll b/test/Transforms/SROA/alloca-address-space.ll
new file mode 100644
index 000000000000..6b3b3abbff5f
--- /dev/null
+++ b/test/Transforms/SROA/alloca-address-space.ll
@@ -0,0 +1,84 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64-A2"
+
+declare void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* nocapture, i8 addrspace(2)* nocapture readonly, i32, i32, i1)
+declare void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(2)* nocapture readonly, i32, i32, i1)
+declare void @llvm.memcpy.p2i8.p1i8.i32(i8 addrspace(2)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1)
+declare void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1)
+
+
+
+; CHECK-LABEL: @test_address_space_1_1(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
+define void @test_address_space_1_1(<2 x i64> addrspace(1)* %a, i16 addrspace(1)* %b) {
+ %aa = alloca <2 x i64>, align 16, addrspace(2)
+ %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
+ %aaptr = bitcast <2 x i64> addrspace(2)* %aa to i8 addrspace(2)*
+ call void @llvm.memcpy.p2i8.p1i8.i32(i8 addrspace(2)* %aaptr, i8 addrspace(1)* %aptr, i32 16, i32 2, i1 false)
+ %bptr = bitcast i16 addrspace(1)* %b to i8 addrspace(1)*
+ call void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* %bptr, i8 addrspace(2)* %aaptr, i32 16, i32 2, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: @test_address_space_1_0(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(2)* {{.*}}, align 2
+; CHECK: ret void
+define void @test_address_space_1_0(<2 x i64> addrspace(1)* %a, i16 addrspace(2)* %b) {
+ %aa = alloca <2 x i64>, align 16, addrspace(2)
+ %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
+ %aaptr = bitcast <2 x i64> addrspace(2)* %aa to i8 addrspace(2)*
+ call void @llvm.memcpy.p2i8.p1i8.i32(i8 addrspace(2)* %aaptr, i8 addrspace(1)* %aptr, i32 16, i32 2, i1 false)
+ %bptr = bitcast i16 addrspace(2)* %b to i8 addrspace(2)*
+ call void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* %bptr, i8 addrspace(2)* %aaptr, i32 16, i32 2, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: @test_address_space_0_1(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(2)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
+define void @test_address_space_0_1(<2 x i64> addrspace(2)* %a, i16 addrspace(1)* %b) {
+ %aa = alloca <2 x i64>, align 16, addrspace(2)
+ %aptr = bitcast <2 x i64> addrspace(2)* %a to i8 addrspace(2)*
+ %aaptr = bitcast <2 x i64> addrspace(2)* %aa to i8 addrspace(2)*
+ call void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* %aaptr, i8 addrspace(2)* %aptr, i32 16, i32 2, i1 false)
+ %bptr = bitcast i16 addrspace(1)* %b to i8 addrspace(1)*
+ call void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* %bptr, i8 addrspace(2)* %aaptr, i32 16, i32 2, i1 false)
+ ret void
+}
+
+%struct.struct_test_27.0.13 = type { i32, float, i64, i8, [4 x i32] }
+
+; CHECK-LABEL: @copy_struct(
+; CHECK-NOT: memcpy
+define void @copy_struct([5 x i64] %in.coerce) {
+for.end:
+ %in = alloca %struct.struct_test_27.0.13, align 8, addrspace(2)
+ %0 = bitcast %struct.struct_test_27.0.13 addrspace(2)* %in to [5 x i64] addrspace(2)*
+ store [5 x i64] %in.coerce, [5 x i64] addrspace(2)* %0, align 8
+ %scevgep9 = getelementptr %struct.struct_test_27.0.13, %struct.struct_test_27.0.13 addrspace(2)* %in, i32 0, i32 4, i32 0
+ %scevgep910 = bitcast i32 addrspace(2)* %scevgep9 to i8 addrspace(2)*
+ call void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* undef, i8 addrspace(2)* %scevgep910, i32 16, i32 4, i1 false)
+ ret void
+}
+
+%union.anon = type { i32* }
+
+@g = common global i32 0, align 4
+@l = common addrspace(3) global i32 0, align 4
+
+; Make sure an illegal bitcast isn't introduced
+; CHECK-LABEL: @pr27557(
+; CHECK: %[[CAST:.*]] = bitcast i32* addrspace(2)* {{.*}} to i32 addrspace(3)* addrspace(2)*
+; CHECK: store i32 addrspace(3)* @l, i32 addrspace(3)* addrspace(2)* %[[CAST]]
+define void @pr27557() {
+ %1 = alloca %union.anon, align 8, addrspace(2)
+ %2 = bitcast %union.anon addrspace(2)* %1 to i32* addrspace(2)*
+ store i32* @g, i32* addrspace(2)* %2, align 8
+ %3 = bitcast %union.anon addrspace(2)* %1 to i32 addrspace(3)* addrspace(2)*
+ store i32 addrspace(3)* @l, i32 addrspace(3)* addrspace(2)* %3, align 8
+ ret void
+}
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
index 70096f37be05..aa00e89ea04f 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -3,8 +3,8 @@ target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 define i32 @test0() {
 ; CHECK-LABEL: @test0(
@@ -16,22 +16,22 @@ entry:
 %a2 = alloca float
 %a1.i8 = bitcast i32* %a1 to i8*
- call void @llvm.lifetime.start(i64 4, i8* %a1.i8)
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %a1.i8)
 store i32 0, i32* %a1
 %v1 = load i32, i32* %a1
- call void @llvm.lifetime.end(i64 4, i8* %a1.i8)
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %a1.i8)
 %a2.i8 = bitcast float* %a2 to i8*
- call void @llvm.lifetime.start(i64 4, i8* %a2.i8)
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %a2.i8)
 store float 0.0, float* %a2
 %v2 = load float , float * %a2
 %v2.int = bitcast float %v2 to i32
 %sum1 = add i32 %v1, %v2.int
- call void @llvm.lifetime.end(i64 4, i8* %a2.i8)
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %a2.i8)
 ret i32 %sum1
 }
@@ -1057,7 +1057,7 @@ define void @PR14059.1(double* %d) {
 entry:
 %X.sroa.0.i = alloca double, align 8
 %0 = bitcast double* %X.sroa.0.i to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0)
 ; Store to the low 32-bits...
 %X.sroa.0.0.cast2.i = bitcast double* %X.sroa.0.i to i32*
@@ -1084,7 +1084,7 @@ entry:
 %accum.real.i = load double, double* %d, align 8
 %add.r.i = fadd double %accum.real.i, %X.sroa.0.0.load1.i
 store double %add.r.i, double* %d, align 8
- call void @llvm.lifetime.end(i64 -1, i8* %0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
 ret void
 }
@@ -1652,7 +1652,7 @@ define void @PR25873(%struct.STest* %outData) {
 entry:
 %tmpData = alloca %struct.STest, align 8
 %0 = bitcast %struct.STest* %tmpData to i8*
- call void @llvm.lifetime.start(i64 16, i8* %0)
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %0)
 %x = getelementptr inbounds %struct.STest, %struct.STest* %tmpData, i64 0, i32 0, i32 0
 store float 1.230000e+02, float* %x, align 8
 %y = getelementptr inbounds %struct.STest, %struct.STest* %tmpData, i64 0, i32 0, i32 1
@@ -1664,7 +1664,7 @@ entry:
 store i64 %3, i64* %2, align 8
 %4 = bitcast %struct.STest* %outData to i8*
 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %0, i64 16, i32 4, i1 false)
- call void @llvm.lifetime.end(i64 16, i8* %0)
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %0)
 ret void
 }
@@ -1677,10 +1677,10 @@ define void @PR27999() unnamed_addr {
 entry-block:
 %0 = alloca [2 x i64], align 8
 %1 = bitcast [2 x i64]* %0 to i8*
- call void @llvm.lifetime.start(i64 16, i8* %1)
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %1)
 %2 = getelementptr inbounds [2 x i64], [2 x i64]* %0, i32 0, i32 1
 %3 = bitcast i64* %2 to i8*
- call void @llvm.lifetime.end(i64 8, i8* %3)
+ call void @llvm.lifetime.end.p0i8(i64 8, i8* %3)
 ret void
 }
@@ -1692,6 +1692,6 @@ bb1:
 %e.7.sroa.6.i = alloca i32, align 1
 %e.7.sroa.6.0.load81.i = load i32, i32* %e.7.sroa.6.i, align 1
 %0 = bitcast i32* %e.7.sroa.6.i to i8*
- call void @llvm.lifetime.end(i64 2, i8* %0)
+ call void @llvm.lifetime.end.p0i8(i64 2, i8* %0)
 ret void
 }
diff --git a/test/Transforms/SROA/pr26972.ll b/test/Transforms/SROA/pr26972.ll
index a71058c05b98..3140a805fc4b 100644
--- a/test/Transforms/SROA/pr26972.ll
+++ b/test/Transforms/SROA/pr26972.ll
@@ -10,8 +10,8 @@ target triple = "x86_64-pc-linux"
 define void @fn1() {
 %a = alloca [1073741825 x i32], align 16
 %t0 = bitcast [1073741825 x i32]* %a to i8*
- call void @llvm.lifetime.end(i64 4294967300, i8* %t0)
+ call void @llvm.lifetime.end.p0i8(i64 4294967300, i8* %t0)
 ret void
 }
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
diff --git a/test/Transforms/SROA/preserve-nonnull.ll b/test/Transforms/SROA/preserve-nonnull.ll
new file mode 100644
index 000000000000..fc5ce6a445fa
--- /dev/null
+++ b/test/Transforms/SROA/preserve-nonnull.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+;
+; Make sure that SROA doesn't lose nonnull metadata
+; on loads from allocas that get optimized out.
+
+; CHECK-LABEL: define float* @yummy_nonnull
+; CHECK: [[RETURN:%(.*)]] = load float*, float** %arg, align 8
+; CHECK: [[ASSUME:%(.*)]] = icmp ne float* {{.*}}[[RETURN]], null
+; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]])
+; CHECK: ret float* {{.*}}[[RETURN]]
+
+define float* @yummy_nonnull(float** %arg) {
+entry-block:
+ %buf = alloca float*
+
+ %_arg_i8 = bitcast float** %arg to i8*
+ %_buf_i8 = bitcast float** %buf to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %_buf_i8, i8* %_arg_i8, i64 8, i32 8, i1 false)
+
+ %ret = load float*, float** %buf, align 8, !nonnull !0
+ ret float* %ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+
+!0 = !{}
diff --git a/test/Transforms/SROA/vector-lifetime-intrinsic.ll b/test/Transforms/SROA/vector-lifetime-intrinsic.ll
index 37cf394382ac..abb5cb2ea334 100644
--- a/test/Transforms/SROA/vector-lifetime-intrinsic.ll
+++ b/test/Transforms/SROA/vector-lifetime-intrinsic.ll
@@ -3,10 +3,10 @@ target datalayout = "e-p:64:32-i64:32-v32:32-n32-S64"
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 ; CHECK: @wombat
 ; CHECK-NOT: alloca
@@ -15,12 +15,12 @@ define void @wombat(<4 x float> %arg1) {
 bb:
 %tmp = alloca <4 x float>, align 16
 %tmp8 = bitcast <4 x float>* %tmp to i8*
- call void @llvm.lifetime.start(i64 16, i8* %tmp8)
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %tmp8)
 store <4 x float> %arg1, <4 x float>* %tmp, align 16
 %tmp17 = bitcast <4 x float>* %tmp to <3 x float>*
 %tmp18 = load <3 x float>, <3 x float>* %tmp17
 %tmp20 = bitcast <4 x float>* %tmp to i8*
- call void @llvm.lifetime.end(i64 16, i8* %tmp20)
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %tmp20)
 call void @wombat3(<3 x float> %tmp18)
 ret void
 }
diff --git a/test/Transforms/SafeStack/AArch64/abi_ssp.ll b/test/Transforms/SafeStack/AArch64/abi_ssp.ll
index 5d584d0a76b9..c78b20aaa01a 100644
--- a/test/Transforms/SafeStack/AArch64/abi_ssp.ll
+++ b/test/Transforms/SafeStack/AArch64/abi_ssp.ll
@@ -1,5 +1,5 @@
-; RUN: opt -safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefix=TLS %s
-
+; RUN: opt -safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
+; RUN: opt -safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
 define void @foo() nounwind uwtable safestack sspreq {
 entry:
@@ -7,7 +7,8 @@ entry:
 ; TLS: call i8* @llvm.thread.pointer()
 ; TLS: %[[TP2:.*]] = call i8* @llvm.thread.pointer()
-; TLS: %[[B:.*]] = getelementptr i8, i8* %[[TP2]], i32 40
+; ANDROID: %[[B:.*]] = getelementptr i8, i8* %[[TP2]], i32 40
+; FUCHSIA: %[[B:.*]] = getelementptr i8, i8* %[[TP2]], i32 -16
 ; TLS: %[[C:.*]] = bitcast i8* %[[B]] to i8**
 ; TLS: %[[StackGuard:.*]] = load i8*, i8** %[[C]]
 ; TLS: store i8* %[[StackGuard]], i8** %[[StackGuardSlot:.*]]
diff --git a/test/Transforms/SafeStack/X86/abi_ssp.ll b/test/Transforms/SafeStack/X86/abi_ssp.ll
index ba4ced5b8820..b489e07a8868 100644
--- a/test/Transforms/SafeStack/X86/abi_ssp.ll
+++ b/test/Transforms/SafeStack/X86/abi_ssp.ll
@@ -1,18 +1,25 @@
-; RUN: opt -safe-stack -S -mtriple=i686-pc-linux-gnu < %s -o - | FileCheck --check-prefix=TLS --check-prefix=TLS32 %s
-; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=TLS --check-prefix=TLS64 %s
-; RUN: opt -safe-stack -S -mtriple=i686-linux-android < %s -o - | FileCheck --check-prefix=TLS --check-prefix=TLS32 %s
-; RUN: opt -safe-stack -S -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefix=TLS --check-prefix=TLS64 %s
+; RUN: opt -safe-stack -S -mtriple=i686-pc-linux-gnu < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
+; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefixes=COMMON,TLS64 %s
+
+; RUN: opt -safe-stack -S -mtriple=i686-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,GLOBAL32 %s
+; RUN: opt -safe-stack -S -mtriple=i686-linux-android24 < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
+
+; RUN: opt -safe-stack -S -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,TLS64 %s
+
+; RUN: opt -safe-stack -S -mtriple=x86_64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=COMMON,FUCHSIA64 %s
 define void @foo() safestack sspreq {
 entry:
 ; TLS32: %[[StackGuard:.*]] = load i8*, i8* addrspace(256)* inttoptr (i32 20 to i8* addrspace(256)*)
 ; TLS64: %[[StackGuard:.*]] = load i8*, i8* addrspace(257)* inttoptr (i32 40 to i8* addrspace(257)*)
-; TLS: store i8* %[[StackGuard]], i8** %[[StackGuardSlot:.*]]
+; FUCHSIA64: %[[StackGuard:.*]] = load i8*, i8* addrspace(257)* inttoptr (i32 16 to i8* addrspace(257)*)
+; GLOBAL32: %[[StackGuard:.*]] = load i8*, i8** @__stack_chk_guard
+; COMMON: store i8* %[[StackGuard]], i8** %[[StackGuardSlot:.*]]
 %a = alloca i8, align 1
 call void @Capture(i8* %a)
-; TLS: %[[A:.*]] = load i8*, i8** %[[StackGuardSlot]]
-; TLS: icmp ne i8* %[[StackGuard]], %[[A]]
+; COMMON: %[[A:.*]] = load i8*, i8** %[[StackGuardSlot]]
+; COMMON: icmp ne i8* %[[StackGuard]], %[[A]]
 ret void
 }
diff --git a/test/Transforms/SafeStack/X86/call.ll b/test/Transforms/SafeStack/X86/call.ll
index cbac4ce1bb0d..2d78bb1a6898 100644
--- a/test/Transforms/SafeStack/X86/call.ll
+++ b/test/Transforms/SafeStack/X86/call.ll
@@ -159,8 +159,8 @@ define void @call_lifetime(i32* %p) {
 entry:
 %q = alloca [100 x i8], align 16
 %0 = bitcast [100 x i8]* %q to i8*
- call void @llvm.lifetime.start(i64 100, i8* %0)
- call void @llvm.lifetime.end(i64 100, i8* %0)
+ call void @llvm.lifetime.start.p0i8(i64 100, i8* %0)
+ call void @llvm.lifetime.end.p0i8(i64 100, i8* %0)
 ret void
 }
@@ -174,5 +174,5 @@ declare void @readnone0(i8* nocapture readnone, i8* nocapture)
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind argmemonly
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind argmemonly
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind argmemonly
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind argmemonly
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind argmemonly
diff --git a/test/Transforms/SafeStack/X86/coloring-ssp.ll b/test/Transforms/SafeStack/X86/coloring-ssp.ll
index d71babe200df..3b04fdf13fbc 100644
--- a/test/Transforms/SafeStack/X86/coloring-ssp.ll
+++ b/test/Transforms/SafeStack/X86/coloring-ssp.ll
@@ -16,19 +16,19 @@ entry:
 %x0 = bitcast i64* %x to i8*
 %y0 = bitcast i64* %y to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -16
 call void @capture64(i64* %x)
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
- call void @llvm.lifetime.start(i64 -1, i8* %y0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -16
 call void @capture64(i64* %y)
- call void @llvm.lifetime.end(i64 -1, i8* %y0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
 ret void
 }
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @capture64(i64*)
diff --git a/test/Transforms/SafeStack/X86/coloring.ll b/test/Transforms/SafeStack/X86/coloring.ll
index 3ed9ccb43f39..76bdf37dbf4e 100644
--- a/test/Transforms/SafeStack/X86/coloring.ll
+++ b/test/Transforms/SafeStack/X86/coloring.ll
@@ -10,35 +10,35 @@ entry:
 %x1 = alloca i32, align 4
 %x2 = alloca i32, align 4
 %0 = bitcast i32* %x to i8*
- call void @llvm.lifetime.start(i64 4, i8* %0)
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %0)
 ; CHECK: %[[A1:.*]] = getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: %[[A2:.*]] = bitcast i8* %[[A1]] to i32*
 ; CHECK: call void @capture(i32* nonnull %[[A2]])
 call void @capture(i32* nonnull %x)
- call void @llvm.lifetime.end(i64 4, i8* %0)
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %0)
 %1 = bitcast i32* %x1 to i8*
- call void @llvm.lifetime.start(i64 4, i8* %1)
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %1)
 ; CHECK: %[[B1:.*]] = getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: %[[B2:.*]] = bitcast i8* %[[B1]] to i32*
 ; CHECK: call void @capture(i32* nonnull %[[B2]])
 call void @capture(i32* nonnull %x1)
- call void @llvm.lifetime.end(i64 4, i8* %1)
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %1)
 %2 = bitcast i32* %x2 to i8*
- call void @llvm.lifetime.start(i64 4, i8* %2)
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %2)
 ; CHECK: %[[C1:.*]] = getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: %[[C2:.*]] = bitcast i8* %[[C1]] to i32*
 ; CHECK: call void @capture(i32* nonnull %[[C2]])
 call void @capture(i32* nonnull %x2)
- call void @llvm.lifetime.end(i64 4, i8* %2)
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %2)
 ret void
 }
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @capture(i32*)
diff --git a/test/Transforms/SafeStack/X86/coloring2.ll b/test/Transforms/SafeStack/X86/coloring2.ll
index f3ac6d735c9d..2a8f871945ff 100644
--- a/test/Transforms/SafeStack/X86/coloring2.ll
+++ b/test/Transforms/SafeStack/X86/coloring2.ll
@@ -15,21 +15,21 @@ entry:
 %y0 = bitcast i32* %y to i8*
 %z0 = bitcast i32* %z to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %z0)
- call void @llvm.lifetime.start(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %z0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 call void @capture32(i32* %x)
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
- call void @llvm.lifetime.start(i64 -1, i8* %y0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 call void @capture32(i32* %y)
- call void @llvm.lifetime.end(i64 -1, i8* %y0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -8
 call void @capture32(i32* %z)
- call void @llvm.lifetime.end(i64 -1, i8* %z0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %z0)
 ret void
 }
@@ -44,11 +44,11 @@ entry:
 %y = alloca i32, align 4
 %x0 = bitcast i32* %x to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 call void @capture32(i32* %x)
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -8
 call void @capture32(i32* %y)
@@ -70,21 +70,21 @@ entry:
 %y0 = bitcast i32* %y to i8*
 %z0 = bitcast i64* %z to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %x0)
- call void @llvm.lifetime.start(i64 -1, i8* %y0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 call void @capture32(i32* %x)
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -8
 call void @capture32(i32* %y)
- call void @llvm.lifetime.end(i64 -1, i8* %y0)
- call void @llvm.lifetime.start(i64 -1, i8* %z0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %z0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -8
 call void @capture64(i64* %z)
- call void @llvm.lifetime.end(i64 -1, i8* %z0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %z0)
 ret void
 }
@@ -103,9 +103,9 @@ entry:
 %y0 = bitcast i32* %y to i8*
 %z0 = bitcast i64* %z to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %x0)
- call void @llvm.lifetime.start(i64 -1, i8* %y0)
- call void @llvm.lifetime.start(i64 -1, i8* %z0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %z0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -16
 call void @capture32(i32* %x)
@@ -116,9 +116,9 @@ entry:
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -8
 call void @capture64(i64* %z)
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
- call void @llvm.lifetime.end(i64 -1, i8* %y0)
- call void @llvm.lifetime.end(i64 -1, i8* %z0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %z0)
 ret void
 }
@@ -156,9 +156,9 @@ entry:
 %z1 = alloca i64, align 8
 %z2 = alloca i64, align 8
 %0 = bitcast i64* %x1 to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0)
 %1 = bitcast i64* %x2 to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %1)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %1)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -8
 ; CHECK: call void @capture64(
 call void @capture64(i64* nonnull %x1)
@@ -169,7 +169,7 @@ entry:
 if.then: ; preds = %entry
 %2 = bitcast i64* %y to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %2)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %2)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -24
 ; CHECK: call void @capture64(
 call void @capture64(i64* nonnull %y)
@@ -177,29 +177,29 @@ if.then: ; preds = %entry
 if.then3: ; preds = %if.then
 %3 = bitcast i64* %y1 to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %3)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %3)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -32
 ; CHECK: call void @capture64(
 call void @capture64(i64* nonnull %y1)
- call void @llvm.lifetime.end(i64 -1, i8* %3)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %3)
 br label %if.end
 if.else: ; preds = %if.then
 %4 = bitcast i64* %y2 to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %4)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %4)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -32
 ; CHECK: call void @capture64(
 call void @capture64(i64* nonnull %y2)
- call void @llvm.lifetime.end(i64 -1, i8* %4)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %4)
 br label %if.end
 if.end: ; preds = %if.else, %if.then3
- call void @llvm.lifetime.end(i64 -1, i8* %2)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %2)
 br label %if.end9
 if.else4: ; preds = %entry
 %5 = bitcast i64* %z to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %5)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %5)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -24
 ; CHECK: call void @capture64(
 call void @capture64(i64* nonnull %z)
@@ -207,29 +207,29 @@ if.else4: ; preds = %entry
 if.then6: ; preds = %if.else4
 %6 = bitcast i64* %z1 to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %6)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %6)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -32
 ; CHECK: call void @capture64(
 call void @capture64(i64* nonnull %z1)
- call void @llvm.lifetime.end(i64 -1, i8* %6)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %6)
 br label %if.end8
 if.else7: ; preds = %if.else4
 %7 = bitcast i64* %z2 to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %7)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %7)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -32
 ; CHECK: call void @capture64(
 call void @capture64(i64* nonnull %z2)
- call void @llvm.lifetime.end(i64 -1, i8* %7)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %7)
 br label %if.end8
 if.end8: ; preds = %if.else7, %if.then6
- call void @llvm.lifetime.end(i64 -1, i8* %5)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %5)
 br label %if.end9
 if.end9: ; preds = %if.end8, %if.end
- call void @llvm.lifetime.end(i64 -1, i8* %1)
- call void @llvm.lifetime.end(i64 -1, i8* %0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
 ret void
 }
@@ -243,21 +243,21 @@ entry:
 %y = alloca i32, align 4
 %x0 = bitcast i32* %x to i8*
 %y0 = bitcast i32* %y to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: call void @capture32(
 call void @capture32(i32* %x)
 br i1 %d, label %bb2, label %bb3
 bb2:
- call void @llvm.lifetime.start(i64 -1, i8* %y0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -8
 ; CHECK: call void @capture32(
 call void @capture32(i32* %y)
- call void @llvm.lifetime.end(i64 -1, i8* %y0)
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 ret void
 bb3:
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 ret void
 }
@@ -270,18 +270,18 @@ entry:
 %y = alloca i32, align 4
 %x0 = bitcast i32* %x to i8*
 %y0 = bitcast i32* %y to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: call void @capture32(
 call void @capture32(i32* %x)
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 br i1 %d, label %bb2, label %bb3
 bb2:
- call void @llvm.lifetime.start(i64 -1, i8* %y0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: call void @capture32(
 call void @capture32(i32* %y)
- call void @llvm.lifetime.end(i64 -1, i8* %y0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
 ret void
 bb3:
 ret void
@@ -297,14 +297,14 @@ entry:
 %y = alloca i32, align 4
 %x0 = bitcast i32* %x to i8*
 %y0 = bitcast i32* %y to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: call void @capture32(
 call void @capture32(i32* %x)
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 br i1 %d, label %bb2, label %bb3
 bb2:
- call void @llvm.lifetime.start(i64 -1, i8* %y0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: call void @capture32(
 call void @capture32(i32* %y)
@@ -323,14 +323,14 @@ entry:
 %y = alloca i32, align 4
 %x0 = bitcast i32* %x to i8*
 %y0 = bitcast i32* %y to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: call void @capture32(
 call void @capture32(i32* %x)
 br i1 %d, label %bb2, label %bb3
 bb2:
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
- call void @llvm.lifetime.start(i64 -1, i8* %y0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: call void @capture32(
 call void @capture32(i32* %y)
@@ -352,10 +352,10 @@ entry:
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK: call void @capture32(
 call void @capture32(i32* %x)
- call void @llvm.lifetime.end(i64 -1, i8* %x0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 br i1 %d, label %bb2, label %bb3
 bb2:
- call void @llvm.lifetime.start(i64 -1, i8* %y0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -8
 ; CHECK: call void @capture32(
 call void @capture32(i32* %y)
@@ -374,29 +374,29 @@ entry:
 %A.i = alloca [100 x i32], align 4
 %B.i = alloca [100 x i32], align 4
 %0 = bitcast [100 x i32]* %A.i to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %0)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0)
 %1 = bitcast [100 x i32]* %B.i to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %1)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %1)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -400
 ; CHECK: call void @capture100x32(
 call void @capture100x32([100 x i32]* %A.i)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -800
 ; CHECK: call void @capture100x32(
 call void @capture100x32([100 x i32]* %B.i)
- call void @llvm.lifetime.end(i64 -1, i8* %0)
- call void @llvm.lifetime.end(i64 -1, i8* %1)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1)
 %2 = bitcast [100 x i32]* %A.i1 to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %2)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %2)
 %3 = bitcast [100 x i32]* %B.i2 to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %3)
+ call void @llvm.lifetime.start.p0i8(i64 -1, i8* %3)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -400
 ; CHECK: call void @capture100x32(
 call void @capture100x32([100 x i32]* %A.i1)
 ; CHECK: getelementptr i8, i8* %[[USP]], i32 -800
 ; CHECK: call void @capture100x32(
 call void @capture100x32([100 x i32]* %B.i2)
- call void @llvm.lifetime.end(i64 -1, i8* %2)
- call void @llvm.lifetime.end(i64 -1, i8* %3)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %2)
+ call void @llvm.lifetime.end.p0i8(i64 -1, i8* %3)
 ret void
} @@ -408,11 +408,11 @@ entry: %buf1 = alloca i8, i32 100000, align 16 %buf2 = alloca i8, i32 100000, align 16 - call void @llvm.lifetime.start(i64 -1, i8* %buf1) - call void @llvm.lifetime.end(i64 -1, i8* %buf1) + call void @llvm.lifetime.start.p0i8(i64 -1, i8* %buf1) + call void @llvm.lifetime.end.p0i8(i64 -1, i8* %buf1) - call void @llvm.lifetime.start(i64 -1, i8* %buf1) - call void @llvm.lifetime.start(i64 -1, i8* %buf2) + call void @llvm.lifetime.start.p0i8(i64 -1, i8* %buf1) + call void @llvm.lifetime.start.p0i8(i64 -1, i8* %buf2) call void @capture8(i8* %buf1) call void @capture8(i8* %buf2) ret void @@ -435,13 +435,13 @@ entry: %A.i = alloca [100 x i32], align 4 %B.i = alloca [100 x i32], align 4 %0 = bitcast [100 x i32]* %A.i to i8* - call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind + call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0) nounwind %1 = bitcast [100 x i32]* %B.i to i8* - call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind + call void @llvm.lifetime.start.p0i8(i64 -1, i8* %1) nounwind call void @capture100x32([100 x i32]* %A.i) call void @capture100x32([100 x i32]* %B.i) - call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind - call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind + call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0) nounwind + call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1) nounwind br label %block2 block2: @@ -464,13 +464,13 @@ entry: %b8 = bitcast [4 x %struct.Klass]* %b.i to i8* ; I am used outside the lifetime zone below: %z2 = getelementptr inbounds [4 x %struct.Klass], [4 x %struct.Klass]* %a.i, i64 0, i64 0, i32 0 - call void @llvm.lifetime.start(i64 -1, i8* %a8) - call void @llvm.lifetime.start(i64 -1, i8* %b8) + call void @llvm.lifetime.start.p0i8(i64 -1, i8* %a8) + call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b8) call void @capture8(i8* %a8) call void @capture8(i8* %b8) %z3 = load i32, i32* %z2, align 16 - call void @llvm.lifetime.end(i64 -1, i8* %a8) - call void @llvm.lifetime.end(i64 -1, i8* %b8) + call void @llvm.lifetime.end.p0i8(i64 -1, i8* %a8) + call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b8) ret i32 %z3 } @@ -480,12 +480,12 @@ entry: ; CHECK: %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr ; CHECK-NEXT: getelementptr i8, i8* %[[USP]], i32 -16 %x = alloca i8, align 4 - call void @llvm.lifetime.start(i64 4, i8* %x) nounwind + call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) nounwind br label %l2 l2: call void @capture8(i8* %x) - call void @llvm.lifetime.end(i64 4, i8* %x) nounwind + call void @llvm.lifetime.end.p0i8(i64 4, i8* %x) nounwind br label %l2 } @@ -498,23 +498,23 @@ entry: ; CHECK-NEXT: getelementptr i8, i8* %[[USP]], i32 -16 %x = alloca i8, align 4 %y = alloca i8, align 4 - call void @llvm.lifetime.start(i64 4, i8* %x) nounwind + call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) nounwind br label %l2 l2: ; CHECK: getelementptr i8, i8* %[[USP]], i32 -8 - call void @llvm.lifetime.start(i64 4, i8* %y) nounwind + call void @llvm.lifetime.start.p0i8(i64 4, i8* %y) nounwind call void @capture8(i8* %y) - call void @llvm.lifetime.end(i64 4, i8* %y) nounwind + call void @llvm.lifetime.end.p0i8(i64 4, i8* %y) nounwind ; CHECK: getelementptr i8, i8* %[[USP]], i32 -4 - call void @llvm.lifetime.start(i64 4, i8* %x) nounwind + call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) nounwind call void @capture8(i8* %x) br label %l2 } -declare void @llvm.lifetime.start(i64, i8* nocapture) -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) +declare void 
@llvm.lifetime.end.p0i8(i64, i8* nocapture) declare void @capture8(i8*) declare void @capture32(i32*) declare void @capture64(i64*) diff --git a/test/Transforms/SafeStack/X86/debug-loc2.ll b/test/Transforms/SafeStack/X86/debug-loc2.ll index 35e9b7711d2f..8059a722fd45 100644 --- a/test/Transforms/SafeStack/X86/debug-loc2.ll +++ b/test/Transforms/SafeStack/X86/debug-loc2.ll @@ -40,12 +40,12 @@ entry: } ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 declare void @capture(i32*) #2 ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 ; Function Attrs: nounwind readnone declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3 diff --git a/test/Transforms/SafeStack/X86/layout-frag.ll b/test/Transforms/SafeStack/X86/layout-frag.ll index 125eb0f8be9a..b127defc2c5d 100644 --- a/test/Transforms/SafeStack/X86/layout-frag.ll +++ b/test/Transforms/SafeStack/X86/layout-frag.ll @@ -14,16 +14,16 @@ entry: %x0a = bitcast i64* %x0 to i8* %x2a = bitcast i64* %x2 to i8* - call void @llvm.lifetime.start(i64 4, i8* %x0a) + call void @llvm.lifetime.start.p0i8(i64 4, i8* %x0a) call void @capture64(i64* %x0) - call void @llvm.lifetime.end(i64 4, i8* %x0a) + call void @llvm.lifetime.end.p0i8(i64 4, i8* %x0a) - call void @llvm.lifetime.start(i64 4, i8* %x1) - call void @llvm.lifetime.start(i64 4, i8* %x2a) + call void @llvm.lifetime.start.p0i8(i64 4, i8* %x1) + call void @llvm.lifetime.start.p0i8(i64 4, i8* %x2a) call void @capture8(i8* %x1) call void @capture64(i64* %x2) - call void @llvm.lifetime.end(i64 4, i8* %x1) - call void @llvm.lifetime.end(i64 4, i8* %x2a) + call void @llvm.lifetime.end.p0i8(i64 4, i8* %x1) + call void @llvm.lifetime.end.p0i8(i64 4, i8* %x2a) ; Test that i64 allocas share space. 
; CHECK: getelementptr i8, i8* %unsafe_stack_ptr, i32 -8 @@ -33,7 +33,7 @@ entry: ret void } -declare void @llvm.lifetime.start(i64, i8* nocapture) -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) declare void @capture8(i8*) declare void @capture64(i64*) diff --git a/test/Transforms/SampleProfile/Inputs/import.prof b/test/Transforms/SampleProfile/Inputs/import.prof new file mode 100644 index 000000000000..efadc0c5c9c6 --- /dev/null +++ b/test/Transforms/SampleProfile/Inputs/import.prof @@ -0,0 +1,4 @@ +main:10000:0 + 3: foo:1000 + 3: bar:200 + 4: baz:10 diff --git a/test/Transforms/SampleProfile/Inputs/indirect-call.afdo b/test/Transforms/SampleProfile/Inputs/indirect-call.afdo Binary files differ new file mode 100644 index 000000000000..2d5b345e960e --- /dev/null +++ b/test/Transforms/SampleProfile/Inputs/indirect-call.afdo diff --git a/test/Transforms/SampleProfile/Inputs/indirect-call.prof b/test/Transforms/SampleProfile/Inputs/indirect-call.prof new file mode 100644 index 000000000000..428d4cedef5a --- /dev/null +++ b/test/Transforms/SampleProfile/Inputs/indirect-call.prof @@ -0,0 +1,13 @@ +test:63067:0 + 4: 3345 _Z3barv:1398 _Z3foov:2059 +test_inline:3000:0 + 5: foo_inline1:3000 + 1: 3000 + 5: foo_inline2:4000 + 1: 4000 +test_noinline:3000:0 + 5: foo_noinline:3000 + 1: 3000 +test_direct:3000:0 + 5: foo_direct:3000 + 1: 3000 diff --git a/test/Transforms/SampleProfile/branch.ll b/test/Transforms/SampleProfile/branch.ll index 2ef01a76b0f0..5a5160e6343a 100644 --- a/test/Transforms/SampleProfile/branch.ll +++ b/test/Transforms/SampleProfile/branch.ll @@ -87,7 +87,9 @@ for.cond: ; preds = %for.inc, %if.then.2 %6 = load i32, i32* %u, align 4, !dbg !46 %7 = load i32, i32* %limit, align 4, !dbg !48 %cmp5 = icmp slt i32 %6, %7, !dbg !49 - br i1 %cmp5, label %for.body, label %for.end, !dbg !50 + br i1 %cmp5, label %for.body, label %for.end, !dbg !50, !prof !80 +; CHECK: edge for.cond -> for.body probability is 0x73333333 / 0x80000000 = 90.00% +; CHECK: edge for.cond -> for.end probability is 0x0ccccccd / 0x80000000 = 10.00% for.body: ; preds = %for.cond call void @llvm.dbg.declare(metadata double* %x, metadata !51, metadata !17), !dbg !53 @@ -237,3 +239,4 @@ attributes #4 = { nounwind readonly } !77 = !DILocation(line: 20, column: 4, scope: !6) !78 = !DILocation(line: 21, column: 4, scope: !6) !79 = !DILocation(line: 22, column: 2, scope: !6) +!80 = !{!"branch_weights", i32 90, i32 10} diff --git a/test/Transforms/SampleProfile/calls.ll b/test/Transforms/SampleProfile/calls.ll index 45909ddf3e54..3539c771627a 100644 --- a/test/Transforms/SampleProfile/calls.ll +++ b/test/Transforms/SampleProfile/calls.ll @@ -48,8 +48,8 @@ while.cond: ; preds = %if.end, %entry store i32 %inc, i32* %i, align 4, !dbg !14 %cmp = icmp slt i32 %0, 400000000, !dbg !14 br i1 %cmp, label %while.body, label %while.end, !dbg !14 -; CHECK: edge while.cond -> while.body probability is 0x7d9eb367 / 0x80000000 = 98.14% [HOT edge] -; CHECK: edge while.cond -> while.end probability is 0x02614c99 / 0x80000000 = 1.86% +; CHECK: edge while.cond -> while.body probability is 0x77f2798d / 0x80000000 = 93.71% [HOT edge] +; CHECK: edge while.cond -> while.end probability is 0x080d8673 / 0x80000000 = 6.29% while.body: ; preds = %while.cond %1 = load i32, i32* %i, align 4, !dbg !16 @@ -59,8 +59,8 @@ while.body: ; preds = %while.cond ; both branches out of while.body had the same weight. 
In reality, ; the edge while.body->if.then is taken most of the time. ; -; CHECK: edge while.body -> if.else probability is 0x00059704 / 0x80000000 = 0.02% -; CHECK: edge while.body -> if.then probability is 0x7ffa68fc / 0x80000000 = 99.98% [HOT edge] +; CHECK: edge while.body -> if.else probability is 0x0005b1e0 / 0x80000000 = 0.02% +; CHECK: edge while.body -> if.then probability is 0x7ffa4e20 / 0x80000000 = 99.98% [HOT edge] if.then: ; preds = %while.body @@ -103,14 +103,14 @@ declare i32 @printf(i8*, ...) #2 !12 = !DILocation(line: 8, scope: !7) !13 = !DILocation(line: 9, scope: !7) !14 = !DILocation(line: 9, scope: !15) -!15 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !7) +!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7) !16 = !DILocation(line: 10, scope: !17) !17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7) !18 = !DILocation(line: 10, scope: !19) -!19 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !17) +!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) !20 = !DILocation(line: 10, scope: !21) -!21 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) +!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17) !22 = !DILocation(line: 10, scope: !23) -!23 = !DILexicalBlockFile(discriminator: 3, file: !1, scope: !17) +!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17) !24 = !DILocation(line: 11, scope: !7) !25 = !DILocation(line: 12, scope: !7) diff --git a/test/Transforms/SampleProfile/cov-zero-samples.ll b/test/Transforms/SampleProfile/cov-zero-samples.ll index 7ccaa3e7d756..5239d74fdc6e 100644 --- a/test/Transforms/SampleProfile/cov-zero-samples.ll +++ b/test/Transforms/SampleProfile/cov-zero-samples.ll @@ -106,7 +106,7 @@ attributes #0 = { nounwind readnone } !13 = !{!14, !14} !14 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !15 = !DILocation(line: 5, column: 27, scope: !16) -!16 = !DILexicalBlockFile(scope: !11, file: !3, discriminator: 3) +!16 = !DILexicalBlockFile(scope: !11, file: !3, discriminator: 6) !17 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 7, type: !18, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: false, unit: !2, variables: !4) !18 = !DISubroutineType(types: !19) !19 = !{!14} @@ -118,7 +118,7 @@ attributes #0 = { nounwind readnone } !25 = !DILocation(line: 9, column: 18, scope: !24) !26 = !DILocation(line: 9, column: 8, scope: !24) !27 = !DILocation(line: 9, column: 25, scope: !28) -!28 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 1) +!28 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2) !29 = distinct !DILexicalBlock(scope: !24, file: !3, line: 9, column: 3) !30 = !DILocation(line: 9, column: 29, scope: !28) !31 = !DILocation(line: 9, column: 27, scope: !28) @@ -130,7 +130,7 @@ attributes #0 = { nounwind readnone } !37 = !DILocation(line: 10, column: 11, scope: !34) !38 = !DILocation(line: 10, column: 9, scope: !35) !39 = !DILocation(line: 10, column: 36, scope: !40) -!40 = !DILexicalBlockFile(scope: !34, file: !3, discriminator: 1) +!40 = !DILexicalBlockFile(scope: !34, file: !3, discriminator: 2) !41 = !DILocation(line: 10, column: 23, scope: !40) !42 = !DILocation(line: 10, column: 20, scope: !40) !43 = !DILocation(line: 10, column: 16, scope: !40) @@ -139,7 +139,7 @@ attributes #0 = { nounwind readnone } !46 = !DILocation(line: 11, column: 9, scope: !35) !47 = !DILocation(line: 12, column: 3, scope: !35) !48 = !DILocation(line: 9, column: 
33, scope: !49) -!49 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2) +!49 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 4) !50 = !DILocation(line: 9, column: 3, scope: !49) !51 = !DILocation(line: 13, column: 25, scope: !17) !52 = !DILocation(line: 13, column: 3, scope: !17) diff --git a/test/Transforms/SampleProfile/discriminator.ll b/test/Transforms/SampleProfile/discriminator.ll index d0b96a9ea16e..85f6cbe8fb4a 100644 --- a/test/Transforms/SampleProfile/discriminator.ll +++ b/test/Transforms/SampleProfile/discriminator.ll @@ -79,12 +79,12 @@ while.end: ; preds = %while.cond !10 = !DILocation(line: 2, scope: !4) !11 = !DILocation(line: 3, scope: !4) !12 = !DILocation(line: 3, scope: !13) -!13 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !4) +!13 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !4) !14 = !DILocation(line: 4, scope: !15) !15 = distinct !DILexicalBlock(line: 4, column: 0, file: !1, scope: !16) !16 = distinct !DILexicalBlock(line: 3, column: 0, file: !1, scope: !4) !17 = !DILocation(line: 4, scope: !18) -!18 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !15) +!18 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !15) !19 = !DILocation(line: 5, scope: !16) !20 = !DILocation(line: 6, scope: !16) !21 = !DILocation(line: 7, scope: !4) diff --git a/test/Transforms/SampleProfile/early-inline.ll b/test/Transforms/SampleProfile/early-inline.ll index 780ff4751f40..51e7d243c187 100644 --- a/test/Transforms/SampleProfile/early-inline.ll +++ b/test/Transforms/SampleProfile/early-inline.ll @@ -41,8 +41,8 @@ declare i32 @__gxx_personality_v0(...) !1 = !DIFile(filename: "a", directory: "b/") !3 = !{i32 2, !"Dwarf Version", i32 4} !4 = !{i32 2, !"Debug Info Version", i32 3} -!6 = distinct !DISubprogram(linkageName: "_Z3foov", scope: !1, line: 5, scopeLine: 5, unit: !0) +!6 = distinct !DISubprogram(linkageName: "_Z3foov", scope: !1, file: !1, line: 5, scopeLine: 5, unit: !0) !9 = !DILocation(line: 6, column: 3, scope: !6) !10 = !DILocation(line: 8, column: 5, scope: !11) !11 = distinct !DILexicalBlock(scope: !6, file: !1, line: 7, column: 7) -!12 = distinct !DISubprogram(linkageName: "_ZL3barv", scope: !1, line: 20, scopeLine: 20, unit: !0) +!12 = distinct !DISubprogram(linkageName: "_ZL3barv", scope: !1, file: !1, line: 20, scopeLine: 20, unit: !0) diff --git a/test/Transforms/SampleProfile/fnptr.ll b/test/Transforms/SampleProfile/fnptr.ll index 0c671a7882f6..1b01d0c0c857 100644 --- a/test/Transforms/SampleProfile/fnptr.ll +++ b/test/Transforms/SampleProfile/fnptr.ll @@ -8,10 +8,10 @@ ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/fnptr.prof | opt -analyze -branch-prob | FileCheck %s ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/fnptr.binprof | opt -analyze -branch-prob | FileCheck %s -; CHECK: edge for.body3 -> if.then probability is 0x19f584f3 / 0x80000000 = 20.28% -; CHECK: edge for.body3 -> if.else probability is 0x660a7b0d / 0x80000000 = 79.72% -; CHECK: edge for.inc -> for.inc12 probability is 0x000f92fb / 0x80000000 = 0.05% -; CHECK: edge for.inc -> for.body3 probability is 0x7ff06d05 / 0x80000000 = 99.95% +; CHECK: edge for.body3 -> if.then probability is 0x1a56a56a / 0x80000000 = 20.58% +; CHECK: edge for.body3 -> if.else probability is 0x65a95a96 / 0x80000000 = 79.42% +; CHECK: edge for.inc -> for.inc12 probability is 0x000fbd1c / 0x80000000 = 0.05% +; CHECK: edge for.inc -> for.body3 probability is 0x7ff042e4 / 0x80000000 = 99.95% ; CHECK: edge for.inc12 -> for.end14 
probability is 0x04000000 / 0x80000000 = 3.12% ; CHECK: edge for.inc12 -> for.cond1.preheader probability is 0x7c000000 / 0x80000000 = 96.88% diff --git a/test/Transforms/SampleProfile/import.ll b/test/Transforms/SampleProfile/import.ll new file mode 100644 index 000000000000..1ee45fb4fd3e --- /dev/null +++ b/test/Transforms/SampleProfile/import.ll @@ -0,0 +1,31 @@ +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/import.prof -S | FileCheck %s + +; Tests whether the functions in the inline stack are added to the +; function_entry_count metadata. + +declare void @foo() + +define void @main() !dbg !7 { + call void @foo(), !dbg !18 + ret void +} + +; GUIDs of foo and bar should be included in the metadata to make sure hot +; inline stacks are imported. +; CHECK: !{!"function_entry_count", i64 1, i64 6699318081062747564, i64 -2012135647395072713} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!6 = !DISubroutineType(types: !2) +!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !1, type: !6, variables: !2) +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = !{!"clang version 3.5 "} +!15 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !7) +!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7) +!18 = !DILocation(line: 10, scope: !17) diff --git a/test/Transforms/SampleProfile/indirect-call-gcc.ll b/test/Transforms/SampleProfile/indirect-call-gcc.ll new file mode 100644 index 000000000000..678c7931250e --- /dev/null +++ b/test/Transforms/SampleProfile/indirect-call-gcc.ll @@ -0,0 +1,26 @@ +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call.afdo -S | FileCheck %s + +; Checks that indirect call targets are read correctly when reading from a +; gcc-format profile. +; It is expected to fail on certain architectures, as the gcc profile reader +; does not work there. 
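; [Editor's gloss, not part of the patch] The ![[PROF]] node this file checks
; at the end is value-profile metadata. As this editor understands the format,
; it reads !{!"VP", kind, total-count, value, count, value, count, ...}: kind 0
; marks indirect-call targets, the per-target counts here (2059 + 1398) sum to
; the total (3457), and each value is a hash of a callee's name. A made-up
; node with the same shape:
;   !99 = !{!"VP", i32 0, i64 100, i64 1111, i64 60, i64 2222, i64 40}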
+; XFAIL: powerpc64-, s390x, mips-, mips64-, sparc + +define void @test(void ()*) !dbg !3 { + %2 = alloca void ()* + store void ()* %0, void ()** %2 + %3 = load void ()*, void ()** %2 + ; CHECK: call {{.*}}, !prof ![[PROF:[0-9]+]] + call void %3(), !dbg !4 + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1) +!1 = !DIFile(filename: "test.cc", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, unit: !0) +!4 = !DILocation(line: 5, scope: !3) +; CHECK: ![[PROF]] = !{!"VP", i32 0, i64 3457, i64 9191153033785521275, i64 2059, i64 -1069303473483922844, i64 1398} diff --git a/test/Transforms/SampleProfile/indirect-call.ll b/test/Transforms/SampleProfile/indirect-call.ll new file mode 100644 index 000000000000..5a4913d6358f --- /dev/null +++ b/test/Transforms/SampleProfile/indirect-call.ll @@ -0,0 +1,82 @@ +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call.prof -S | FileCheck %s + +; CHECK-LABEL: @test +define void @test(void ()*) !dbg !3 { + %2 = alloca void ()* + store void ()* %0, void ()** %2 + %3 = load void ()*, void ()** %2 + ; CHECK: call {{.*}}, !prof ![[PROF:[0-9]+]] + call void %3(), !dbg !4 + ret void +} + +; CHECK-LABEL: @test_inline +; If the indirect call is promoted and inlined in profile, we should promote and inline it. +define void @test_inline(i64* (i32*)*, i32* %x) !dbg !3 { + %2 = alloca i64* (i32*)* + store i64* (i32*)* %0, i64* (i32*)** %2 + %3 = load i64* (i32*)*, i64* (i32*)** %2 +; CHECK: icmp {{.*}} @foo_inline2 +; CHECK: if.true.direct_targ: +; CHECK-NOT: call +; CHECK: if.false.orig_indirect: +; CHECK: icmp {{.*}} @foo_inline1 +; CHECK: if.true.direct_targ1: +; CHECK-NOT: call +; CHECK: if.false.orig_indirect2: +; CHECK: call + call i64* %3(i32* %x), !dbg !5 + ret void +} + +; CHECK-LABEL: @test_noinline +; If the indirect call target is not available, we should not promote it. +define void @test_noinline(void ()*) !dbg !3 { + %2 = alloca void ()* + store void ()* %0, void ()** %2 + %3 = load void ()*, void ()** %2 +; CHECK-NOT: icmp +; CHECK: call + call void %3(), !dbg !5 + ret void +} + +@x = global i32 0, align 4 + +define i32* @foo_inline1(i32* %x) !dbg !3 { + ret i32* %x +} + +define i32* @foo_inline2(i32* %x) !dbg !3 { + ret i32* %x +} + +define i32 @foo_noinline(i32 %x) !dbg !3 { + ret i32 %x +} + +define void @foo_direct() !dbg !3 { + ret void +} + +; CHECK-LABEL: @test_direct +; We should not promote a direct call. 
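; [Editor's sketch, not part of the patch] For contrast with @test_direct just
; below, this is the compare-and-branch shape that indirect-call promotion
; emits when it does fire; @known_target and %fp are hypothetical names, and
; the block labels follow the ones the checks above expect.
declare void @known_target()
define void @promoted_shape(void ()* %fp) {
entry:
  %cmp = icmp eq void ()* %fp, @known_target
  br i1 %cmp, label %if.true.direct_targ, label %if.false.orig_indirect
if.true.direct_targ:
  call void @known_target()             ; direct call, eligible for inlining
  br label %merge
if.false.orig_indirect:
  call void %fp()                       ; fallback keeps the original call
  br label %merge
merge:
  ret void
}
; A call that is already direct (or, as below, an alias of a direct callee)
; gains nothing from this rewrite, so the pass must leave it alone.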
+define void @test_direct() !dbg !3 { +; CHECK-NOT: icmp +; CHECK: call + call void @foo_alias(), !dbg !5 + ret void +} + +@foo_alias = alias void (), void ()* @foo_direct + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1) +!1 = !DIFile(filename: "test.cc", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, unit: !0) +!4 = !DILocation(line: 5, scope: !3) +!5 = !DILocation(line: 6, scope: !3) +; CHECK: ![[PROF]] = !{!"VP", i32 0, i64 3457, i64 9191153033785521275, i64 2059, i64 -1069303473483922844, i64 1398} diff --git a/test/Transforms/SampleProfile/inline-coverage.ll b/test/Transforms/SampleProfile/inline-coverage.ll index c88e7f865fa2..080876a46471 100644 --- a/test/Transforms/SampleProfile/inline-coverage.ll +++ b/test/Transforms/SampleProfile/inline-coverage.ll @@ -16,7 +16,7 @@ ; 11 return sum > 0 ? 0 : 1; ; 12 } ; -; CHECK: remark: coverage.cc:10:12: inlined hot callee '_Z3fool' with 172746 samples into 'main' +; CHECK: remark: coverage.cc:10:12: inlined hot callee '_Z3fool' into 'main' ; CHECK: remark: coverage.cc:9:21: Applied 23478 samples from profile (offset: 2.1) ; CHECK: remark: coverage.cc:10:16: Applied 23478 samples from profile (offset: 3) ; CHECK: remark: coverage.cc:4:10: Applied 31878 samples from profile (offset: 1) @@ -120,7 +120,7 @@ for.end: ; preds = %for.cond !27 = !DILocation(line: 9, column: 12, scope: !26) !28 = !DILocation(line: 9, column: 8, scope: !26) !29 = !DILocation(line: 9, column: 19, scope: !30) -!30 = !DILexicalBlockFile(scope: !31, file: !1, discriminator: 1) +!30 = !DILexicalBlockFile(scope: !31, file: !1, discriminator: 2) !31 = distinct !DILexicalBlock(scope: !26, file: !1, line: 9, column: 3) !32 = !DILocation(line: 9, column: 21, scope: !30) !33 = !DILocation(line: 9, column: 3, scope: !30) diff --git a/test/Transforms/SampleProfile/inline.ll b/test/Transforms/SampleProfile/inline.ll index ed353834137b..3ed8988968f6 100644 --- a/test/Transforms/SampleProfile/inline.ll +++ b/test/Transforms/SampleProfile/inline.ll @@ -96,14 +96,14 @@ declare i32 @printf(i8*, ...) 
#2 !12 = !DILocation(line: 8, scope: !7) !13 = !DILocation(line: 9, scope: !7) !14 = !DILocation(line: 9, scope: !15) -!15 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !7) +!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7) !16 = !DILocation(line: 10, scope: !17) !17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7) !18 = !DILocation(line: 10, scope: !19) -!19 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !17) +!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) !20 = !DILocation(line: 10, scope: !21) -!21 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) +!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17) !22 = !DILocation(line: 10, scope: !23) -!23 = !DILexicalBlockFile(discriminator: 3, file: !1, scope: !17) +!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17) !24 = !DILocation(line: 11, scope: !7) !25 = !DILocation(line: 12, scope: !7) diff --git a/test/Transforms/SampleProfile/propagate.ll b/test/Transforms/SampleProfile/propagate.ll index 45e3b8003ffc..5a4922bde935 100644 --- a/test/Transforms/SampleProfile/propagate.ll +++ b/test/Transforms/SampleProfile/propagate.ll @@ -244,7 +244,7 @@ attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no- !31 = !DILocation(line: 7, column: 15, scope: !29) !32 = !DILocation(line: 7, column: 10, scope: !29) !33 = !DILocation(line: 7, column: 22, scope: !34) -!34 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 1) +!34 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 2) !35 = distinct !DILexicalBlock(scope: !29, file: !1, line: 7, column: 5) !36 = !DILocation(line: 7, column: 26, scope: !34) !37 = !DILocation(line: 7, column: 24, scope: !34) @@ -275,7 +275,7 @@ attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no- !62 = !DILocation(line: 14, column: 24, scope: !59) !63 = !DILocation(line: 14, column: 14, scope: !59) !64 = !DILocation(line: 14, column: 31, scope: !65) -!65 = !DILexicalBlockFile(scope: !66, file: !1, discriminator: 1) +!65 = !DILexicalBlockFile(scope: !66, file: !1, discriminator: 2) !66 = distinct !DILexicalBlock(scope: !59, file: !1, line: 14, column: 9) !67 = !DILocation(line: 14, column: 33, scope: !65) !68 = !DILocation(line: 14, column: 9, scope: !65) @@ -285,11 +285,11 @@ attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no- !72 = !DILocation(line: 16, column: 13, scope: !70) !73 = !DILocation(line: 17, column: 9, scope: !70) !74 = !DILocation(line: 14, column: 41, scope: !75) -!75 = !DILexicalBlockFile(scope: !66, file: !1, discriminator: 2) +!75 = !DILexicalBlockFile(scope: !66, file: !1, discriminator: 4) !76 = !DILocation(line: 14, column: 9, scope: !75) !77 = !DILocation(line: 19, column: 5, scope: !41) !78 = !DILocation(line: 7, column: 30, scope: !79) -!79 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 2) +!79 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 4) !80 = !DILocation(line: 7, column: 5, scope: !79) !81 = !DILocation(line: 21, column: 10, scope: !6) !82 = !DILocation(line: 21, column: 14, scope: !6) @@ -313,5 +313,5 @@ attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no- !100 = !DILocation(line: 28, column: 57, scope: !86) !101 = !DILocation(line: 28, column: 47, scope: !86) !102 = !DILocation(line: 28, column: 3, scope: !103) -!103 = !DILexicalBlockFile(scope: !86, file: !1, discriminator: 1) +!103 = !DILexicalBlockFile(scope: !86, file: !1, 
discriminator: 2) !104 = !DILocation(line: 29, column: 3, scope: !86) diff --git a/test/Transforms/SampleProfile/remarks.ll b/test/Transforms/SampleProfile/remarks.ll index 908e4f8b10b4..dfb075ee00ea 100644 --- a/test/Transforms/SampleProfile/remarks.ll +++ b/test/Transforms/SampleProfile/remarks.ll @@ -19,7 +19,7 @@ ; We are expecting foo() to be inlined in main() (almost all the cycles are ; spent inside foo). -; CHECK: remark: remarks.cc:13:21: inlined hot callee '_Z3foov' with 623868 samples into 'main' +; CHECK: remark: remarks.cc:13:21: inlined hot callee '_Z3foov' into 'main' ; The back edge for the loop is the hottest edge in the loop subgraph. ; CHECK: remark: remarks.cc:6:9: most popular destination for conditional branches at remarks.cc:5:3 @@ -33,11 +33,11 @@ entry: %sum = alloca i64, align 8 %i = alloca i32, align 4 %0 = bitcast i64* %sum to i8*, !dbg !19 - call void @llvm.lifetime.start(i64 8, i8* %0) #4, !dbg !19 + call void @llvm.lifetime.start.p0i8(i64 8, i8* %0) #4, !dbg !19 call void @llvm.dbg.declare(metadata i64* %sum, metadata !9, metadata !20), !dbg !21 store i64 0, i64* %sum, align 8, !dbg !21, !tbaa !22 %1 = bitcast i32* %i to i8*, !dbg !26 - call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !26 + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #4, !dbg !26 call void @llvm.dbg.declare(metadata i32* %i, metadata !10, metadata !20), !dbg !27 store i32 0, i32* %i, align 4, !dbg !27, !tbaa !28 br label %for.cond, !dbg !26 @@ -49,7 +49,7 @@ for.cond: ; preds = %for.inc, %entry for.cond.cleanup: ; preds = %for.cond %3 = bitcast i32* %i to i8*, !dbg !36 - call void @llvm.lifetime.end(i64 4, i8* %3) #4, !dbg !36 + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #4, !dbg !36 br label %for.end for.body: ; preds = %for.cond @@ -88,12 +88,12 @@ for.inc: ; preds = %if.end for.end: ; preds = %for.cond.cleanup %10 = load i64, i64* %sum, align 8, !dbg !53, !tbaa !22 %11 = bitcast i64* %sum to i8*, !dbg !54 - call void @llvm.lifetime.end(i64 8, i8* %11) #4, !dbg !54 + call void @llvm.lifetime.end.p0i8(i64 8, i8* %11) #4, !dbg !54 ret i64 %10, !dbg !55 } ; Function Attrs: nounwind argmemonly -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 ; Function Attrs: nounwind readnone declare void @llvm.dbg.declare(metadata, metadata, metadata) #2 @@ -102,7 +102,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #2 declare i32 @rand() #3 ; Function Attrs: nounwind argmemonly -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 ; Function Attrs: nounwind uwtable define i32 @main() #0 !dbg !13 { diff --git a/test/Transforms/Scalarizer/vector-gep.ll b/test/Transforms/Scalarizer/vector-gep.ll new file mode 100644 index 000000000000..eacddf136a32 --- /dev/null +++ b/test/Transforms/Scalarizer/vector-gep.ll @@ -0,0 +1,122 @@ +; RUN: opt -S -scalarizer %s | FileCheck %s + +; Check that the scalarizer can handle vector GEPs with scalar indices + +@vec = global <4 x i16*> <i16* null, i16* null, i16* null, i16* null> +@index = global i16 1 +@ptr = global [4 x i16] [i16 1, i16 2, i16 3, i16 4] +@ptrptr = global i16* null + +; constant index +define void @test1() { +bb: + %0 = load <4 x i16*>, <4 x i16*>* @vec + %1 = getelementptr i16, <4 x i16*> %0, i16 1 + + ret void +} + +;CHECK-LABEL: @test1 +;CHECK: %[[I0:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 0 +;CHECK: getelementptr i16, i16* %[[I0]], i16 1 +;CHECK: %[[I1:.i[0-9]*]] = extractelement 
<4 x i16*> %0, i32 1 +;CHECK: getelementptr i16, i16* %[[I1]], i16 1 +;CHECK: %[[I2:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 2 +;CHECK: getelementptr i16, i16* %[[I2]], i16 1 +;CHECK: %[[I3:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 3 +;CHECK: getelementptr i16, i16* %[[I3]], i16 1 + +; non-constant index +define void @test2() { +bb: + %0 = load <4 x i16*>, <4 x i16*>* @vec + %index = load i16, i16* @index + %1 = getelementptr i16, <4 x i16*> %0, i16 %index + + ret void +} + +;CHECK-LABEL: @test2 +;CHECK: %0 = load <4 x i16*>, <4 x i16*>* @vec +;CHECK: %[[I0:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 0 +;CHECK: %[[I1:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 1 +;CHECK: %[[I2:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 2 +;CHECK: %[[I3:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 3 +;CHECK: %index = load i16, i16* @index +;CHECK: %.splatinsert = insertelement <4 x i16> undef, i16 %index, i32 0 +;CHECK: %.splat = shufflevector <4 x i16> %.splatinsert, <4 x i16> undef, <4 x i32> zeroinitializer +;CHECK: %.splat[[I0]] = extractelement <4 x i16> %.splat, i32 0 +;CHECK: getelementptr i16, i16* %[[I0]], i16 %.splat[[I0]] +;CHECK: %.splat[[I1]] = extractelement <4 x i16> %.splat, i32 1 +;CHECK: getelementptr i16, i16* %[[I1]], i16 %.splat[[I1]] +;CHECK: %.splat[[I2]] = extractelement <4 x i16> %.splat, i32 2 +;CHECK: getelementptr i16, i16* %[[I2]], i16 %.splat[[I2]] +;CHECK: %.splat[[I3]] = extractelement <4 x i16> %.splat, i32 3 +;CHECK: getelementptr i16, i16* %[[I3]], i16 %.splat[[I3]] + + +; Check that the scalarizer can handle vector GEPs with scalar pointer + +; constant pointer +define void @test3() { +bb: + %0 = bitcast [4 x i16]* @ptr to i16* + %1 = getelementptr i16, i16* %0, <4 x i16> <i16 0, i16 1, i16 2, i16 3> + + ret void +} + +;CHECK-LABEL: @test3 +;CHECK: %0 = bitcast [4 x i16]* @ptr to i16* +;CHECK: %.splatinsert = insertelement <4 x i16*> undef, i16* %0, i32 0 +;CHECK: %.splat = shufflevector <4 x i16*> %.splatinsert, <4 x i16*> undef, <4 x i32> zeroinitializer +;CHECK: %.splat[[I0:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 0 +;CHECK: getelementptr i16, i16* %.splat[[I0]], i16 0 +;CHECK: %.splat[[I1:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 1 +;CHECK: getelementptr i16, i16* %.splat[[I1]], i16 1 +;CHECK: %.splat[[I2:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 2 +;CHECK: getelementptr i16, i16* %.splat[[I2]], i16 2 +;CHECK: %.splat[[I3:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 3 +;CHECK: getelementptr i16, i16* %.splat[[I3]], i16 3 + +; non-constant pointer +define void @test4() { +bb: + %0 = load i16*, i16** @ptrptr + %1 = getelementptr i16, i16* %0, <4 x i16> <i16 0, i16 1, i16 2, i16 3> + + ret void +} + +;CHECK-LABEL: @test4 +;CHECK: %0 = load i16*, i16** @ptrptr +;CHECK: %.splatinsert = insertelement <4 x i16*> undef, i16* %0, i32 0 +;CHECK: %.splat = shufflevector <4 x i16*> %.splatinsert, <4 x i16*> undef, <4 x i32> zeroinitializer +;CHECK: %.splat[[I0:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 0 +;CHECK: getelementptr i16, i16* %.splat[[I0]], i16 0 +;CHECK: %.splat[[I1:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 1 +;CHECK: getelementptr i16, i16* %.splat[[I1]], i16 1 +;CHECK: %.splat[[I2:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 2 +;CHECK: getelementptr i16, i16* %.splat[[I2]], i16 2 +;CHECK: %.splat[[I3:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 3 +;CHECK: getelementptr i16, i16* %.splat[[I3]], i16 3 + +; constant index, inbounds +define void @test5() { +bb: + %0 = load <4 x 
i16*>, <4 x i16*>* @vec + %1 = getelementptr inbounds i16, <4 x i16*> %0, i16 1 + + ret void +} + +;CHECK-LABEL: @test5 +;CHECK: %[[I0:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 0 +;CHECK: getelementptr inbounds i16, i16* %[[I0]], i16 1 +;CHECK: %[[I1:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 1 +;CHECK: getelementptr inbounds i16, i16* %[[I1]], i16 1 +;CHECK: %[[I2:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 2 +;CHECK: getelementptr inbounds i16, i16* %[[I2]], i16 1 +;CHECK: %[[I3:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 3 +;CHECK: getelementptr inbounds i16, i16* %[[I3]], i16 1 + diff --git a/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll index 5815ae627373..23ec0ca25544 100644 --- a/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll +++ b/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll @@ -9,7 +9,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 1 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 32 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 33 -define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) { +define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) { %tmp = sext i32 %y to i64 %tmp1 = sext i32 %x to i64 %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp @@ -42,7 +42,7 @@ define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) ; IR: add i32 %x, 256 ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) { +define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) { %tmp = sext i32 %y to i64 %tmp1 = sext i32 %x to i64 %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp @@ -74,7 +74,7 @@ define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace( ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 255 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16128 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16383 -define void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) { +define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) { %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %y %tmp4 = load float, float addrspace(3)* %tmp2, align 4 %tmp5 = fadd float %tmp4, 0.000000e+00 diff --git a/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll b/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll index 16f028d2e85a..90a9aa4d95b7 100644 --- 
a/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll +++ b/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll @@ -1,8 +1,8 @@ -; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=static < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE -; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE -; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=ropi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE -; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=rwpi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE -; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=ropi-rwpi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE +; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=static < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE +; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE +; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=ropi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE +; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=rwpi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE +; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=ropi-rwpi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE ; CHECK: @{{.*}} = private unnamed_addr constant [3 x i32] [i32 1234, i32 5678, i32 15532] ; ENABLE: @{{.*}} = private unnamed_addr constant [3 x i32*] [i32* @c1, i32* @c2, i32* @c3] diff --git a/test/Transforms/SimplifyCFG/CoveredLookupTable.ll b/test/Transforms/SimplifyCFG/CoveredLookupTable.ll index 8b45a590bb1f..a42349e3d874 100644 --- a/test/Transforms/SimplifyCFG/CoveredLookupTable.ll +++ b/test/Transforms/SimplifyCFG/CoveredLookupTable.ll @@ -1,4 +1,4 @@ -; RUN: opt -simplifycfg -S %s | FileCheck %s +; RUN: opt -latesimplifycfg -S %s | FileCheck %s ; rdar://15268442 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll b/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll index f3e5506ad933..ae6ff6d10bcf 100644 --- a/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll +++ b/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -simplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s +; RUN: opt -S -latesimplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s ; rdar://17887153 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin12.0.0" diff --git a/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll b/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll index 26008700f5be..734312bc7285 100644 --- a/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll +++ b/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -simplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s +; RUN: opt -S -latesimplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s ; rdar://17735071 target datalayout = 
"e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin12.0.0" diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll index 81c153483c66..4b9227b029ec 100644 --- a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll +++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -simplifycfg -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s +; RUN: opt < %s -latesimplifycfg -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -1178,8 +1178,9 @@ return: ret i32 %retval.0 ; CHECK-LABEL: @reuse_cmp2( ; CHECK: entry: -; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0 -; CHECK-NEXT: [[C:%.+]] = icmp ult i32 %switch.tableidx, 4 +; CHECK-NEXT: %switch = icmp ult i32 %x, 4 +; CHECK-NEXT: %x. = select i1 %switch, i32 %x, i32 4 +; CHECK-NEXT: [[C:%.+]] = icmp ne i32 %x., 4 ; CHECK: [[R:%.+]] = select i1 [[C]], i32 {{.*}}, i32 100 ; CHECK-NEXT: ret i32 [[R]] } diff --git a/test/Transforms/SimplifyCFG/critedge-assume.ll b/test/Transforms/SimplifyCFG/critedge-assume.ll new file mode 100644 index 000000000000..bfeb65769deb --- /dev/null +++ b/test/Transforms/SimplifyCFG/critedge-assume.ll @@ -0,0 +1,83 @@ +; RUN: opt -o %t %s -instcombine -simplifycfg -thinlto-bc -verify-assumption-cache +; RUN: llvm-dis -o - %t | FileCheck %s + +; Test that the simplifycfg pass correctly updates the assumption cache +; when it clones the llvm.assume call as part of creating a critical +; edge. To do that, we set up a pass pipeline such that (1) an assumption +; cache is created for foo before simplifycfg updates it, and (2) foo's +; assumption cache is verified after simplifycfg has run. To satisfy 1, we +; run the instcombine pass first in our pipeline. To satisfy 2, we use the +; ThinLTOBitcodeWriter pass to write bitcode (that pass uses the assumption +; cache). That ensures that the pass manager does not call releaseMemory() +; on the AssumptionCacheTracker before the end of the pipeline, which would +; wipe out the bad assumption cache before it is verified. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%class.F = type { i8 } +%class.B = type { i8 } +%class.A = type { %class.C } +%class.C = type { i32 (...)** } + +define void @foo(%class.F* %this, %class.B* %out) { +entry: + %call = tail call i32 @_ZNK1F5beginEv(%class.F* %this) + %call2 = tail call i32 @_ZNK1F3endEv(%class.F* %this) + %cmp.i22 = icmp eq i32 %call, %call2 + br i1 %cmp.i22, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %frame_node.sroa.0.023 = phi i32 [ %inc.i, %_ZN10unique_ptrD2Ev.exit ], [ %call, %while.body.preheader ] + %call8 = tail call i8* @_Znwm(i64 8) + %inc.i = add nsw i32 %frame_node.sroa.0.023, 1 + %cmp = icmp eq i32 %inc.i, %call2 + br i1 %cmp, label %_ZN10unique_ptrD2Ev.exit, label %if.then + +if.then: + tail call void @_ZN1B6appendEv(%class.B* %out) + br label %_ZN10unique_ptrD2Ev.exit + +_ZN10unique_ptrD2Ev.exit: + %x1 = bitcast i8* %call8 to void (%class.A*)*** + %vtable.i.i = load void (%class.A*)**, void (%class.A*)*** %x1, align 8 + %x2 = bitcast void (%class.A*)** %vtable.i.i to i8* + %x3 = tail call i1 @llvm.type.test(i8* %x2, metadata !"foo") + ; CHECK: call void @llvm.assume + ; CHECK: call void @llvm.assume + tail call void @llvm.assume(i1 %x3) #5 + br i1 %cmp, label %while.end.loopexit, label %while.body + +while.end.loopexit: + br label %while.end + +while.end: + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) + +declare i32 @_ZNK1F5beginEv(%class.F*) + +declare i32 @_ZNK1F3endEv(%class.F*) + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) + +declare noalias nonnull i8* @_Znwm(i64) + +declare void @_ZN1B6appendEv(%class.B*) + +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) + +declare i1 @llvm.type.test(i8*, metadata) + +declare void @llvm.assume(i1) + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 "} diff --git a/test/Transforms/SimplifyCFG/div-rem-pairs.ll b/test/Transforms/SimplifyCFG/div-rem-pairs.ll new file mode 100644 index 000000000000..85ffe1f4e0f3 --- /dev/null +++ b/test/Transforms/SimplifyCFG/div-rem-pairs.ll @@ -0,0 +1,119 @@ +; RUN: opt -simplifycfg -S < %s | FileCheck %s + +; FIXME: Hoist the sdiv because it's safe and free. +; PR31028 - https://bugs.llvm.org/show_bug.cgi?id=31028 + +define i32 @hoist_sdiv(i32 %a, i32 %b) { +; CHECK-LABEL: @hoist_sdiv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REM:%.*]] = srem i32 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[REM]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 %a, %b +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[DIV]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %rem = srem i32 %a, %b + %cmp = icmp eq i32 %rem, 42 + br i1 %cmp, label %if, label %end + +if: + %div = sdiv i32 %a, %b + br label %end + +end: + %ret = phi i32 [ %div, %if ], [ 3, %entry ] + ret i32 %ret +} + +; FIXME: Hoist the udiv because it's safe and free. 
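; [Editor's note, not part of the patch] The hoists these FIXMEs ask for are
; safe because the remainder has already executed with the same operands: any
; trapping division (%b == 0, or INT_MIN by -1 in the signed case) is already
; undefined at the srem/urem, so the matching division cannot introduce a new
; trap. A sketch of the output the FIXME in @hoist_sdiv above seems to want,
; assuming the select form SimplifyCFG favors:
define i32 @hoist_sdiv_expected(i32 %a, i32 %b) {
entry:
  %rem = srem i32 %a, %b
  %div = sdiv i32 %a, %b      ; speculated: cannot trap if the srem did not
  %cmp = icmp eq i32 %rem, 42
  %ret = select i1 %cmp, i32 %div, i32 3
  ret i32 %ret
}
; "Free" refers to targets whose divide instruction already produces both the
; quotient and the remainder (x86's idiv, for example).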
+ +define i64 @hoist_udiv(i64 %a, i64 %b) { +; CHECK-LABEL: @hoist_udiv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REM:%.*]] = urem i64 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[REM]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 %a, %b +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i64 [ [[DIV]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i64 [[RET]] +; +entry: + %rem = urem i64 %a, %b + %cmp = icmp eq i64 %rem, 42 + br i1 %cmp, label %if, label %end + +if: + %div = udiv i64 %a, %b + br label %end + +end: + %ret = phi i64 [ %div, %if ], [ 3, %entry ] + ret i64 %ret +} + +; FIXME: Hoist the srem because it's safe and likely free. + +define i16 @hoist_srem(i16 %a, i16 %b) { +; CHECK-LABEL: @hoist_srem( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i16 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[DIV]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: [[REM:%.*]] = srem i16 %a, %b +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i16 [ [[REM]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i16 [[RET]] +; +entry: + %div = sdiv i16 %a, %b + %cmp = icmp eq i16 %div, 42 + br i1 %cmp, label %if, label %end + +if: + %rem = srem i16 %a, %b + br label %end + +end: + %ret = phi i16 [ %rem, %if ], [ 3, %entry ] + ret i16 %ret +} + +; FIXME: Hoist the urem because it's safe and likely free. + +define i8 @hoist_urem(i8 %a, i8 %b) { +; CHECK-LABEL: @hoist_urem( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = udiv i8 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[DIV]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: [[REM:%.*]] = urem i8 %a, %b +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i8 [ [[REM]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + %div = udiv i8 %a, %b + %cmp = icmp eq i8 %div, 42 + br i1 %cmp, label %if, label %end + +if: + %rem = urem i8 %a, %b + br label %end + +end: + %ret = phi i8 [ %rem, %if ], [ 3, %entry ] + ret i8 %ret +} + diff --git a/test/Transforms/SimplifyCFG/empty-cleanuppad.ll b/test/Transforms/SimplifyCFG/empty-cleanuppad.ll index 9f657a81a05b..f2e0114a2a35 100644 --- a/test/Transforms/SimplifyCFG/empty-cleanuppad.ll +++ b/test/Transforms/SimplifyCFG/empty-cleanuppad.ll @@ -413,14 +413,14 @@ return: ; preds = %invoke.cont, %catch define i32 @f9() personality i32 (...)* @__CxxFrameHandler3 { entry: %s = alloca i8, align 1 - call void @llvm.lifetime.start(i64 1, i8* nonnull %s) + call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %s) %bc = bitcast i8* %s to %struct.S2* invoke void @"\01??1S2@@QEAA@XZ"(%struct.S2* %bc) to label %try.cont unwind label %ehcleanup ehcleanup: %cleanup.pad = cleanuppad within none [] - call void @llvm.lifetime.end(i64 1, i8* nonnull %s) + call void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %s) cleanupret from %cleanup.pad unwind label %catch.dispatch catch.dispatch: @@ -466,5 +466,5 @@ declare void @use_x(i32 %x) declare i32 @__CxxFrameHandler3(...) 
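; [Editor's note, not part of the patch] The renaming that runs through this
; whole change reflects that the lifetime intrinsics are now overloaded on
; their pointer operand: the ".p0i8" suffix spells out "pointer to i8 in
; address space 0". Under that reading, a non-default address space would be
; declared as below (illustration only; this patch touches only addrspace 0):
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture)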
-declare void @llvm.lifetime.start(i64, i8* nocapture) -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) diff --git a/test/Transforms/SimplifyCFG/lifetime.ll b/test/Transforms/SimplifyCFG/lifetime.ll index 7c66be529500..270fe4d54422 100644 --- a/test/Transforms/SimplifyCFG/lifetime.ll +++ b/test/Transforms/SimplifyCFG/lifetime.ll @@ -10,11 +10,11 @@ define void @foo(i1 %x) { entry: %a = alloca i8 - call void @llvm.lifetime.start(i64 -1, i8* %a) nounwind + call void @llvm.lifetime.start.p0i8(i64 -1, i8* %a) nounwind br i1 %x, label %bb0, label %bb1 bb0: - call void @llvm.lifetime.end(i64 -1, i8* %a) nounwind + call void @llvm.lifetime.end.p0i8(i64 -1, i8* %a) nounwind br label %bb1 bb1: @@ -24,6 +24,6 @@ bb1: declare void @f() -declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind diff --git a/test/Transforms/SimplifyCFG/merge-cond-stores.ll b/test/Transforms/SimplifyCFG/merge-cond-stores.ll index 77e3158d9bbd..d5d0224a4b24 100644 --- a/test/Transforms/SimplifyCFG/merge-cond-stores.ll +++ b/test/Transforms/SimplifyCFG/merge-cond-stores.ll @@ -1,16 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -simplifycfg -instcombine < %s -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=2 -S | FileCheck %s -; CHECK-LABEL: @test_simple ; This test should succeed and end up if-converted. -; CHECK: icmp eq i32 %b, 0 -; CHECK-NEXT: icmp ne i32 %a, 0 -; CHECK-NEXT: xor i1 %x2, true -; CHECK-NEXT: %[[x:.*]] = or i1 %{{.*}}, %{{.*}} -; CHECK-NEXT: br i1 %[[x]] -; CHECK: store -; CHECK-NOT: store -; CHECK: ret define void @test_simple(i32* %p, i32 %a, i32 %b) { +; CHECK-LABEL: @test_simple( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: [[X2:%.*]] = icmp eq i32 [[B:%.*]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[X2]], true +; CHECK-NEXT: [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] +; CHECK: [[NOT_X2:%.*]] = xor i1 [[X2]], true +; CHECK-NEXT: [[DOT:%.*]] = zext i1 [[NOT_X2]] to i32 +; CHECK-NEXT: store i32 [[DOT]], i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[TMP4]] +; CHECK: ret void +; entry: %x1 = icmp eq i32 %a, 0 br i1 %x1, label %fallthrough, label %yes1 @@ -31,12 +36,26 @@ end: ret void } -; CHECK-LABEL: @test_recursive ; This test should entirely fold away, leaving one large basic block. 
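; [Editor's sketch, not part of the patch] The rewrite the regenerated checks
; below spell out, in its simplest form: two conditional stores to the same
; pointer become a single store guarded by the union of the conditions, with
; the stored value picked by a select (the later store wins when both fire).
; %c1, %c2, and the constants are hypothetical.
define void @merged_store_shape(i32* %p, i1 %c1, i1 %c2) {
entry:
  %guard = or i1 %c1, %c2
  br i1 %guard, label %do.store, label %end
do.store:
  %v = select i1 %c2, i32 1, i32 0
  store i32 %v, i32* %p, align 4
  br label %end
end:
  ret void
}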
-; CHECK: store -; CHECK-NOT: store -; CHECK: ret define void @test_recursive(i32* %p, i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: @test_recursive( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = or i32 [[B:%.*]], [[A:%.*]] +; CHECK-NEXT: [[X4:%.*]] = icmp eq i32 [[D:%.*]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[TMP0]], [[C:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[X4]], true +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP6:%.*]] +; CHECK: [[X3:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[NOT_X2:%.*]] = icmp ne i32 [[B]], 0 +; CHECK-NEXT: [[DOT:%.*]] = zext i1 [[NOT_X2]] to i32 +; CHECK-NEXT: [[DOT_:%.*]] = select i1 [[X3]], i32 [[DOT]], i32 2 +; CHECK-NEXT: [[DOT__:%.*]] = select i1 [[X4]], i32 [[DOT_]], i32 3 +; CHECK-NEXT: store i32 [[DOT__]], i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[TMP6]] +; CHECK: ret void +; entry: %x1 = icmp eq i32 %a, 0 br i1 %x1, label %fallthrough, label %yes1 @@ -74,13 +93,31 @@ end: ret void } -; CHECK-LABEL: @test_not_ifconverted ; The code in each diamond is too large - it won't be if-converted so our ; heuristics should say no. -; CHECK: store -; CHECK: store -; CHECK: ret define void @test_not_ifconverted(i32* %p, i32 %a, i32 %b) { +; CHECK-LABEL: @test_not_ifconverted( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[X1]], label [[FALLTHROUGH:%.*]], label [[YES1:%.*]] +; CHECK: yes1: +; CHECK-NEXT: [[Y1:%.*]] = or i32 [[B:%.*]], 55 +; CHECK-NEXT: [[Y2:%.*]] = add i32 [[Y1]], 24 +; CHECK-NEXT: [[Y3:%.*]] = and i32 [[Y2]], 67 +; CHECK-NEXT: store i32 [[Y3]], i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[FALLTHROUGH]] +; CHECK: fallthrough: +; CHECK-NEXT: [[X2:%.*]] = icmp eq i32 [[B]], 0 +; CHECK-NEXT: br i1 [[X2]], label [[END:%.*]], label [[YES2:%.*]] +; CHECK: yes2: +; CHECK-NEXT: [[Z1:%.*]] = or i32 [[A]], 55 +; CHECK-NEXT: [[Z2:%.*]] = add i32 [[Z1]], 24 +; CHECK-NEXT: [[Z3:%.*]] = and i32 [[Z2]], 67 +; CHECK-NEXT: store i32 [[Z3]], i32* [[P]], align 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret void +; entry: %x1 = icmp eq i32 %a, 0 br i1 %x1, label %fallthrough, label %yes1 @@ -107,13 +144,26 @@ end: ret void } -; CHECK-LABEL: @test_aliasing1 ; The store to %p clobbers the previous store, so if-converting this would ; be illegal. -; CHECK: store -; CHECK: store -; CHECK: ret define void @test_aliasing1(i32* %p, i32 %a, i32 %b) { +; CHECK-LABEL: @test_aliasing1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[X1]], label [[FALLTHROUGH:%.*]], label [[YES1:%.*]] +; CHECK: yes1: +; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[FALLTHROUGH]] +; CHECK: fallthrough: +; CHECK-NEXT: [[Y1:%.*]] = load i32, i32* [[P]], align 4 +; CHECK-NEXT: [[X2:%.*]] = icmp eq i32 [[Y1]], 0 +; CHECK-NEXT: br i1 [[X2]], label [[END:%.*]], label [[YES2:%.*]] +; CHECK: yes2: +; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret void +; entry: %x1 = icmp eq i32 %a, 0 br i1 %x1, label %fallthrough, label %yes1 @@ -135,12 +185,25 @@ end: ret void } -; CHECK-LABEL: @test_aliasing2 ; The load from %q aliases with %p, so if-converting this would be illegal. 
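; [Editor's worked example, not part of the patch] Why the merge is illegal
; when %q may alias %p: the load has to observe the first conditional store.
; With the two pointers equal, hoisting the load above that store changes the
; loaded value, as this self-contained trace shows:
define i32 @alias_order_trace() {
entry:
  %p = alloca i32
  store i32 5, i32* %p
  store i32 0, i32* %p        ; stands in for the store in %yes1
  %y1 = load i32, i32* %p     ; the aliasing load: must see 0, not 5
  ret i32 %y1                 ; if-conversion that reorders the load breaks this
}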
-; CHECK: store -; CHECK: store -; CHECK: ret define void @test_aliasing2(i32* %p, i32* %q, i32 %a, i32 %b) { +; CHECK-LABEL: @test_aliasing2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[X1]], label [[FALLTHROUGH:%.*]], label [[YES1:%.*]] +; CHECK: yes1: +; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[FALLTHROUGH]] +; CHECK: fallthrough: +; CHECK-NEXT: [[Y1:%.*]] = load i32, i32* [[Q:%.*]], align 4 +; CHECK-NEXT: [[X2:%.*]] = icmp eq i32 [[Y1]], 0 +; CHECK-NEXT: br i1 [[X2]], label [[END:%.*]], label [[YES2:%.*]] +; CHECK: yes2: +; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret void +; entry: %x1 = icmp eq i32 %a, 0 br i1 %x1, label %fallthrough, label %yes1 @@ -164,12 +227,24 @@ end: declare void @f() -; CHECK-LABEL: @test_diamond_simple ; This should get if-converted. -; CHECK: store -; CHECK-NOT: store -; CHECK: ret define i32 @test_diamond_simple(i32* %p, i32* %q, i32 %a, i32 %b) { +; CHECK-LABEL: @test_diamond_simple( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[Z1:%.*]] = add i32 [[A]], [[B:%.*]] +; CHECK-NEXT: [[Z2:%.*]] = select i1 [[X1]], i32 [[Z1]], i32 0 +; CHECK-NEXT: [[X2:%.*]] = icmp eq i32 [[B]], 0 +; CHECK-NEXT: [[Z3:%.*]] = sub i32 [[Z2]], [[B]] +; CHECK-NEXT: [[Z4:%.*]] = select i1 [[X2]], i32 [[Z3]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = or i32 [[A]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP3:%.*]], label [[TMP2:%.*]] +; CHECK: [[SIMPLIFYCFG_MERGE:%.*]] = select i1 [[X2]], i32 [[Z2]], i32 1 +; CHECK-NEXT: store i32 [[SIMPLIFYCFG_MERGE]], i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[TMP3]] +; CHECK: ret i32 [[Z4]] +; entry: %x1 = icmp eq i32 %a, 0 br i1 %x1, label %no1, label %yes1 @@ -200,14 +275,36 @@ end: ret i32 %z4 } -; CHECK-LABEL: @test_diamond_alias3 ; Now there is a call to f() in the bottom branch. The store in the first ; branch would now be reordered with respect to the call if we if-converted, ; so we must not. 
-; CHECK: store -; CHECK: store -; CHECK: ret define i32 @test_diamond_alias3(i32* %p, i32* %q, i32 %a, i32 %b) { +; CHECK-LABEL: @test_diamond_alias3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[X1]], label [[NO1:%.*]], label [[YES1:%.*]] +; CHECK: yes1: +; CHECK-NEXT: store i32 0, i32* [[P:%.*]], align 4 +; CHECK-NEXT: br label [[FALLTHROUGH:%.*]] +; CHECK: no1: +; CHECK-NEXT: call void @f() +; CHECK-NEXT: [[Z1:%.*]] = add i32 [[A]], [[B:%.*]] +; CHECK-NEXT: br label [[FALLTHROUGH]] +; CHECK: fallthrough: +; CHECK-NEXT: [[Z2:%.*]] = phi i32 [ [[Z1]], [[NO1]] ], [ 0, [[YES1]] ] +; CHECK-NEXT: [[X2:%.*]] = icmp eq i32 [[B]], 0 +; CHECK-NEXT: br i1 [[X2]], label [[NO2:%.*]], label [[YES2:%.*]] +; CHECK: yes2: +; CHECK-NEXT: store i32 1, i32* [[P]], align 4 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: no2: +; CHECK-NEXT: call void @f() +; CHECK-NEXT: [[Z3:%.*]] = sub i32 [[Z2]], [[B]] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[Z4:%.*]] = phi i32 [ [[Z3]], [[NO2]] ], [ 3, [[YES2]] ] +; CHECK-NEXT: ret i32 [[Z4]] +; entry: %x1 = icmp eq i32 %a, 0 br i1 %x1, label %no1, label %yes1 diff --git a/test/Transforms/SimplifyCFG/rangereduce.ll b/test/Transforms/SimplifyCFG/rangereduce.ll index 36e932b37be5..13bbdfe83d07 100644 --- a/test/Transforms/SimplifyCFG/rangereduce.ll +++ b/test/Transforms/SimplifyCFG/rangereduce.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -simplifycfg -S | FileCheck %s +; RUN: opt < %s -latesimplifycfg -S | FileCheck %s target datalayout = "e-n32" diff --git a/test/Transforms/SimplifyCFG/remove-debug-2.ll b/test/Transforms/SimplifyCFG/remove-debug-2.ll new file mode 100644 index 000000000000..6362f53e14c1 --- /dev/null +++ b/test/Transforms/SimplifyCFG/remove-debug-2.ll @@ -0,0 +1,68 @@ +; RUN: opt < %s -simplifycfg -S | FileCheck %s + +; Check if the debug info for hoisted store for "ret = 0" is removed +; +; int foo(int x) { +; int ret = 1; +; if (x) +; ret = 0; +; return ret; +; } +; +; CHECK: store i32 1,{{.+}}!dbg ![[DLOC1:[0-9]+]] +; CHECK: icmp ne {{.+}}!dbg ![[DLOC2:[0-9]+]] +; CHECK: [[VREG:%[^ ]+]] = select +; CHECK: store i32 [[VREG]] +; CHECK-NOT: !dbg +; CHECK-SAME: {{$}} +; CHECK: ret {{.+}}!dbg ![[DLOC3:[0-9]+]] +; CHECK: ![[DLOC1]] = !DILocation(line: 2 +; CHECK: ![[DLOC2]] = !DILocation(line: 3 +; CHECK: ![[DLOC3]] = !DILocation(line: 5 + +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define i32 @foo(i32) !dbg !6 { + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + store i32 %0, i32* %2, align 4 + store i32 1, i32* %3, align 4, !dbg !14 + %4 = load i32, i32* %2, align 4, !dbg !15 + %5 = icmp ne i32 %4, 0, !dbg !15 + br i1 %5, label %6, label %7, !dbg !17 + +; <label>:6: ; preds = %1 + store i32 0, i32* %3, align 4, !dbg !18 + br label %7, !dbg !19 + +; <label>:7: ; preds = %6, %1 + %8 = load i32, i32* %3, align 4, !dbg !20 + ret i32 %8, !dbg !21 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1) +!1 = !DIFile(filename: "foo.c", directory: "b/") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{} +!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!7 = !DISubroutineType(types: !8) +!8 = !{!9, !9} +!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!10 = 
!DILocalVariable(name: "x", arg: 1, scope: !6, file: !1, line: 1, type: !9) +!11 = !DIExpression() +!12 = !DILocation(line: 1, column: 13, scope: !6) +!13 = !DILocalVariable(name: "ret", scope: !6, file: !1, line: 2, type: !9) +!14 = !DILocation(line: 2, column: 7, scope: !6) +!15 = !DILocation(line: 3, column: 7, scope: !16) +!16 = distinct !DILexicalBlock(scope: !6, file: !1, line: 3, column: 7) +!17 = !DILocation(line: 3, column: 7, scope: !6) +!18 = !DILocation(line: 4, column: 9, scope: !16) +!19 = !DILocation(line: 4, column: 5, scope: !16) +!20 = !DILocation(line: 5, column: 10, scope: !6) +!21 = !DILocation(line: 5, column: 3, scope: !6) diff --git a/test/Transforms/SimplifyCFG/switch_create.ll b/test/Transforms/SimplifyCFG/switch_create.ll index 29d3a34a05e6..c752636ae83d 100644 --- a/test/Transforms/SimplifyCFG/switch_create.ll +++ b/test/Transforms/SimplifyCFG/switch_create.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -simplifycfg < %s | FileCheck %s -; RUN: opt -S -default-data-layout="p:32:32-p1:16:16" -simplifycfg < %s | FileCheck -check-prefix=CHECK -check-prefix=DL %s +; RUN: opt -S -data-layout="p:32:32-p1:16:16" -simplifycfg < %s | FileCheck -check-prefix=CHECK -check-prefix=DL %s declare void @foo1() diff --git a/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll index f2853aca698f..9554ae690316 100644 --- a/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll +++ b/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll @@ -6,7 +6,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK-LABEL: @slsr_after_reassociate_global_geps_mubuf_max_offset( ; CHECK: [[b1:%[0-9]+]] = getelementptr float, float addrspace(1)* %arr, i64 [[bump:%[0-9]+]] ; CHECK: [[b2:%[0-9]+]] = getelementptr float, float addrspace(1)* [[b1]], i64 [[bump]] -define void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) { +define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) { bb: %i2 = shl nsw i32 %i, 1 %j1 = add nsw i32 %i, 1023 @@ -33,7 +33,7 @@ bb: ; CHECK: %tmp = sext i32 %j1 to i64 ; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp ; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp5 -define void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) { +define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) { bb: %i2 = shl nsw i32 %i, 1 %j1 = add nsw i32 %i, 1024 @@ -61,7 +61,7 @@ bb: ; CHECK: [[B2:%[0-9]+]] = getelementptr float, float addrspace(3)* [[B1]], i32 %i ; CHECK: getelementptr inbounds float, float addrspace(3)* [[B2]], i32 16383 -define void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) { +define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) { bb: %i2 = shl nsw i32 %i, 1 %j1 = add nsw i32 %i, 16383 @@ -86,7 +86,7 @@ bb: ; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j1 ; CHECK: %j2 = add i32 %j1, %i ; CHECK: 
getelementptr inbounds float, float addrspace(3)* %arr, i32 %j2 -define void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) { +define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) { bb: %i2 = shl nsw i32 %i, 1 %j1 = add nsw i32 %i, 16384 diff --git a/test/Transforms/StripSymbols/strip-dead-debug-info.ll b/test/Transforms/StripSymbols/strip-dead-debug-info.ll index 0e252d70465e..d18c07d54a90 100644 --- a/test/Transforms/StripSymbols/strip-dead-debug-info.ll +++ b/test/Transforms/StripSymbols/strip-dead-debug-info.ll @@ -3,6 +3,9 @@ ; CHECK: ModuleID = '{{.*}}' ; CHECK-NOT: "bar" ; CHECK-NOT: "abcd" +; CHECK-NOT: "GCC" +; CHECK: "Globals" +; CHECK: "abcd2" source_filename = "test/Transforms/StripSymbols/strip-dead-debug-info.ll" @@ -21,7 +24,7 @@ entry: define i32 @foo(i32 %i) #2 !dbg !15 { entry: tail call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !18, metadata !19), !dbg !20 - %.0 = load i32, i32* @xyz, align 4 + %.0 = load i32, i32* @xyz, align 4, !dbg !30 ret i32 %.0, !dbg !21 } @@ -29,7 +32,7 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind readnone ssp } attributes #2 = { nounwind readonly ssp } -!llvm.dbg.cu = !{!4} +!llvm.dbg.cu = !{!4, !23, !24, !28} !llvm.module.flags = !{!9} !0 = !DIGlobalVariableExpression(var: !1) @@ -55,4 +58,11 @@ attributes #2 = { nounwind readonly ssp } !20 = !DILocation(line: 7, scope: !15) !21 = !DILocation(line: 10, scope: !22) !22 = distinct !DILexicalBlock(scope: !15, file: !2, line: 7) - +!23 = distinct !DICompileUnit(language: DW_LANG_C89, file: !2, producer: "GCC", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !5, retainedTypes: !5, globals: !5) +!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !2, producer: "Globals", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !5, retainedTypes: !5, globals: !25) +!25 = !{!26} +!26 = !DIGlobalVariableExpression(var: !27, expr: !DIExpression(DW_OP_constu, 0, DW_OP_stack_value)) +!27 = !DIGlobalVariable(name: "abcd2", scope: !2, file: !2, line: 2, type: !3, isLocal: true, isDefinition: true) +!28 = distinct !DICompileUnit(language: DW_LANG_C89, file: !2, producer: "InlineTest", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !5, retainedTypes: !5, globals: !5) +!29 = distinct !DISubprogram(name: "inlinefunc", linkageName: "inlinefunc", scope: null, file: !2, line: 7, type: !16, isLocal: false, isDefinition: true, isOptimized: true, unit: !28) +!30 = !DILocation(line: 100, scope: !29, inlinedAt: !21) diff --git a/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll b/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll index a635be10d465..9d3a84396cfc 100644 --- a/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll +++ b/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll @@ -6,46 +6,51 @@ target triple = "amdgcn--" -declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2 - -define amdgpu_vs void @wrapper(i32 inreg, i32) { +define amdgpu_vs void @wrapper(i32 inreg %arg, i32 %arg1) { main_body: - %2 = add i32 %1, %0 - %3 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> undef, i32 0, i32 %2) - %4 = 
extractelement <4 x float> %3, i32 1 - %5 = fptosi float %4 to i32 - %6 = insertelement <2 x i32> undef, i32 %5, i32 1 + %tmp = add i32 %arg1, %arg + %tmp2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %tmp, i32 0, i1 false, i1 false) + %tmp3 = extractelement <4 x float> %tmp2, i32 1 + %tmp4 = fptosi float %tmp3 to i32 + %tmp5 = insertelement <2 x i32> undef, i32 %tmp4, i32 1 br label %loop11.i loop11.i: ; preds = %endif46.i, %main_body - %7 = phi i32 [ 0, %main_body ], [ %15, %endif46.i ] - %8 = icmp sgt i32 %7, 999 - br i1 %8, label %main.exit, label %if16.i + %tmp6 = phi i32 [ 0, %main_body ], [ %tmp14, %endif46.i ] + %tmp7 = icmp sgt i32 %tmp6, 999 + br i1 %tmp7, label %main.exit, label %if16.i if16.i: ; preds = %loop11.i - %9 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %6, <8 x i32> undef, i32 15, i1 true, i1 false, i1 false, i1 false) - %10 = extractelement <4 x float> %9, i32 0 - %11 = fcmp ult float 0.000000e+00, %10 - br i1 %11, label %if28.i, label %endif46.i + %tmp8 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp5, <8 x i32> undef, i32 15, i1 true, i1 false, i1 false, i1 false) + %tmp9 = extractelement <4 x float> %tmp8, i32 0 + %tmp10 = fcmp ult float 0.000000e+00, %tmp9 + br i1 %tmp10, label %if28.i, label %endif46.i if28.i: ; preds = %if16.i - %12 = bitcast float %10 to i32 - %13 = shl i32 %12, 16 - %14 = bitcast i32 %13 to float + %tmp11 = bitcast float %tmp9 to i32 + %tmp12 = shl i32 %tmp11, 16 + %tmp13 = bitcast i32 %tmp12 to float br label %main.exit endif46.i: ; preds = %if16.i - %15 = add i32 %7, 1 + %tmp14 = add i32 %tmp6, 1 br label %loop11.i main.exit: ; preds = %if28.i, %loop11.i - %16 = phi float [ %14, %if28.i ], [ 0x36F0800000000000, %loop11.i ] - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %16, float 0.000000e+00, float 0.000000e+00, float 0x36A0000000000000) + %tmp15 = phi float [ %tmp13, %if28.i ], [ 0x36F0800000000000, %loop11.i ] + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp15, float 0.000000e+00, float 0.000000e+00, float 0x36A0000000000000, i1 false, i1 false) #0 ret void } -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind readonly } -attributes #2 = { nounwind } +; Function Attrs: nounwind +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2 + +; Function Attrs: nounwind readonly +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } diff --git a/test/Transforms/ThinLTOBitcodeWriter/circular-reference.ll b/test/Transforms/ThinLTOBitcodeWriter/circular-reference.ll new file mode 100644 index 000000000000..eeda79324497 --- /dev/null +++ b/test/Transforms/ThinLTOBitcodeWriter/circular-reference.ll @@ -0,0 +1,9 @@ +; RUN: opt -thinlto-bc -o %t %s +; RUN: llvm-modextract -b -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=M0 %s +; RUN: llvm-modextract -b -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=M1 %s + +; M0: @g = external constant +; M1: @g = constant +@g = constant i8* bitcast (i8** @g to i8*), !type !0 + +!0 = !{i32 0, !"typeid"} diff --git a/test/Transforms/ThinLTOBitcodeWriter/comdat.ll b/test/Transforms/ThinLTOBitcodeWriter/comdat.ll new file mode 100644 index 
000000000000..caea48e0a543 --- /dev/null +++ b/test/Transforms/ThinLTOBitcodeWriter/comdat.ll @@ -0,0 +1,80 @@ +; RUN: opt -thinlto-bc -o %t %s +; RUN: llvm-modextract -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=THIN %s +; RUN: llvm-modextract -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=MERGED %s + +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc19.0.24215" + +; Internal comdat leader with type metadata. All comdat members need to live +; in the merged module, and the comdat needs to be renamed. +; MERGED: ${{"?lwt[^ ]+}} = comdat any +$lwt = comdat any + +; External comdat leader, type metadata on non-leader. All comdat +; members need to live in the merged module, internal members need to +; be renamed. +; MERGED: $nlwt = comdat any +$nlwt = comdat any + +; Comdat with two members without type metadata. All comdat members live in +; the ThinLTO module and no renaming needs to take place. +; THIN: $nt = comdat any +$nt = comdat any + +; MERGED: @lwt_aliasee = private unnamed_addr global +; MERGED-SAME: comdat(${{"?lwt[^ ]+}}) +@lwt_aliasee = private unnamed_addr global [1 x i8*] [i8* null], comdat($lwt), !type !0 + +; MERGED: {{@"?lwt_nl[^ ]+}} = hidden unnamed_addr global +; MERGED-SAME: comdat(${{"?lwt[^ ]+}}) +; THIN: {{@"?lwt_nl[^ ]+}} = external hidden +@lwt_nl = internal unnamed_addr global i32 0, comdat($lwt) + +; MERGED: @nlwt_aliasee = private unnamed_addr global +; MERGED-SAME: comdat($nlwt) +@nlwt_aliasee = private unnamed_addr global [1 x i8*] [i8* null], comdat($nlwt), !type !0 + +; MERGED: @nlwt = unnamed_addr global +; MERGED-SAME: comdat +; THIN: @nlwt = external +@nlwt = unnamed_addr global i32 0, comdat + +; THIN: @nt = internal +; THIN-SAME: comdat +@nt = internal unnamed_addr global [1 x i8*] [i8* null], comdat + +; THIN: @nt_nl = internal +; THIN-SAME: comdat($nt) +@nt_nl = internal unnamed_addr global i32 0, comdat($nt) + +; MERGED: {{@"?lwt[^ ]+}} = hidden unnamed_addr alias +; THIN: {{@"?lwt[^ ]+}} = external hidden +@lwt = internal unnamed_addr alias [1 x i8*], [1 x i8*]* @lwt_aliasee + +; MERGED: {{@"?nlwt_nl[^ ]+}} = hidden unnamed_addr alias +; THIN: {{@"?nlwt_nl[^ ]+}} = external hidden +@nlwt_nl = internal unnamed_addr alias [1 x i8*], [1 x i8*]* @nlwt_aliasee + +; The functions below exist just to make sure the globals are used. 
+define i8* @lwt_fun() { + %1 = load i32, i32* @lwt_nl + %2 = getelementptr inbounds [1 x i8*], [1 x i8*]* @lwt, i32 0, i32 %1 + %3 = load i8*, i8** %2 + ret i8* %3 +} + +define i8* @nlwt_fun() { + %1 = load i32, i32* @nlwt + %2 = getelementptr inbounds [1 x i8*], [1 x i8*]* @nlwt_nl, i32 0, i32 %1 + %3 = load i8*, i8** %2 + ret i8* %3 +} + +define i8* @nt_fun() { + %1 = load i32, i32* @nt_nl + %2 = getelementptr inbounds [1 x i8*], [1 x i8*]* @nt, i32 0, i32 %1 + %3 = load i8*, i8** %2 + ret i8* %3 +} + +!0 = !{i64 8, !"?AVA@@"} diff --git a/test/Transforms/ThinLTOBitcodeWriter/filter-alias.ll b/test/Transforms/ThinLTOBitcodeWriter/filter-alias.ll new file mode 100644 index 000000000000..d555ab0c1f6d --- /dev/null +++ b/test/Transforms/ThinLTOBitcodeWriter/filter-alias.ll @@ -0,0 +1,16 @@ +; RUN: opt -thinlto-bc -o %t %s +; RUN: llvm-modextract -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-modextract -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=CHECK1 %s +; CHECK0: @al = external global i8* +; CHECK1: @al = unnamed_addr alias i8*, + +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc19.0.24215" + +$al = comdat any + +@anon = private unnamed_addr constant { [1 x i8*] } { [1 x i8*] [i8* null] }, comdat($al), !type !0 + +@al = external unnamed_addr alias i8*, getelementptr inbounds ({ [1 x i8*] }, { [1 x i8*] }* @anon, i32 0, i32 0, i32 1) + +!0 = !{i64 8, !"?AVA@@"} diff --git a/test/Transforms/ThinLTOBitcodeWriter/no-type-md.ll b/test/Transforms/ThinLTOBitcodeWriter/no-type-md.ll index f1ada67abe50..753e07a326b7 100644 --- a/test/Transforms/ThinLTOBitcodeWriter/no-type-md.ll +++ b/test/Transforms/ThinLTOBitcodeWriter/no-type-md.ll @@ -1,6 +1,30 @@ -; RUN: opt -thinlto-bc -o %t %s -; RUN: llvm-dis -o - %t | FileCheck %s -; RUN: llvm-bcanalyzer -dump %t | FileCheck --check-prefix=BCA %s +; Generate bitcode files with summary, as well as minimized bitcode without +; the debug metadata for the thin link. +; RUN: opt -thinlto-bc -thin-link-bitcode-file=%t.thinlink.bc -o %t.bc %s +; RUN: llvm-dis -o - %t.bc | FileCheck %s +; RUN: llvm-dis -o - %t.thinlink.bc | FileCheck --check-prefix=NODEBUG %s +; RUN: llvm-bcanalyzer -dump %t.bc | FileCheck --check-prefix=BCA %s + +; Make sure the combined index files produced by both the normal and the +; thin link bitcode files are identical +; RUN: llvm-lto -thinlto -o %t3 %t.bc +; Copy the minimized bitcode to the regular bitcode path so the module +; paths in the index are the same (save and restore the regular bitcode +; for use again further down). +; RUN: mv %t.bc %t.bc.sv +; RUN: cp %t.thinlink.bc %t.bc +; RUN: llvm-lto -thinlto -o %t4 %t.bc +; RUN: mv %t.bc.sv %t.bc +; RUN: diff %t3.thinlto.bc %t4.thinlto.bc + +; Try again using -thinlto-action to produce combined index +; RUN: rm -f %t3.thinlto.bc %t4.thinlto.bc +; RUN: llvm-lto -thinlto-action=thinlink -o %t3.thinlto.bc %t.bc +; Copy the minimized bitcode to the regular bitcode path so the module +; paths in the index are the same. 
+; RUN: cp %t.thinlink.bc %t.bc +; RUN: llvm-lto -thinlto-action=thinlink -o %t4.thinlto.bc %t.bc +; RUN: diff %t3.thinlto.bc %t4.thinlto.bc ; BCA: <GLOBALVAL_SUMMARY_BLOCK @@ -11,3 +35,10 @@ define void @f() { ret void } + +; CHECK: !llvm.dbg.cu +; NODEBUG-NOT: !llvm.dbg.cu +!llvm.dbg.cu = !{} + +!1 = !{i32 2, !"Debug Info Version", i32 3} +!llvm.module.flags = !{!1} diff --git a/test/Transforms/ThinLTOBitcodeWriter/split-vfunc-internal.ll b/test/Transforms/ThinLTOBitcodeWriter/split-vfunc-internal.ll new file mode 100644 index 000000000000..087796b5031c --- /dev/null +++ b/test/Transforms/ThinLTOBitcodeWriter/split-vfunc-internal.ll @@ -0,0 +1,21 @@ +; RUN: opt -thinlto-bc -o %t %s +; RUN: llvm-modextract -b -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=M0 %s +; RUN: llvm-modextract -b -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=M1 %s + +define [1 x i8*]* @source() { + ret [1 x i8*]* @g +} + +; M0: @"g$84f59439b469192440047efc8de357fb" = external hidden constant [1 x i8*]{{$}} +; M1: @"g$84f59439b469192440047efc8de357fb" = hidden constant [1 x i8*] [i8* bitcast (i64 (i8*)* @"ok$84f59439b469192440047efc8de357fb" to i8*)] +@g = internal constant [1 x i8*] [ + i8* bitcast (i64 (i8*)* @ok to i8*) +], !type !0 + +; M0: define hidden i64 @"ok$84f59439b469192440047efc8de357fb" +; M1: define available_externally hidden i64 @"ok$84f59439b469192440047efc8de357fb" +define internal i64 @ok(i8* %this) { + ret i64 42 +} + +!0 = !{i32 0, !"typeid"} diff --git a/test/Transforms/ThinLTOBitcodeWriter/split-vfunc.ll b/test/Transforms/ThinLTOBitcodeWriter/split-vfunc.ll new file mode 100644 index 000000000000..0793459af414 --- /dev/null +++ b/test/Transforms/ThinLTOBitcodeWriter/split-vfunc.ll @@ -0,0 +1,75 @@ +; RUN: opt -thinlto-bc -o %t %s +; RUN: llvm-modextract -b -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=M0 %s +; RUN: llvm-modextract -b -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=M1 %s + +; M0: @g = external constant [9 x i8*]{{$}} +; M1: @g = constant [9 x i8*] +@g = constant [9 x i8*] [ + i8* bitcast (i64 (i8*)* @ok1 to i8*), + i8* bitcast (i64 (i8*, i64)* @ok2 to i8*), + i8* bitcast (void (i8*)* @wrongtype1 to i8*), + i8* bitcast (i128 (i8*)* @wrongtype2 to i8*), + i8* bitcast (i64 ()* @wrongtype3 to i8*), + i8* bitcast (i64 (i8*, i8*)* @wrongtype4 to i8*), + i8* bitcast (i64 (i8*, i128)* @wrongtype5 to i8*), + i8* bitcast (i64 (i8*)* @usesthis to i8*), + i8* bitcast (i8 (i8*)* @reads to i8*) +], !type !0 + +; M0: define i64 @ok1 +; M1: define available_externally i64 @ok1 +define i64 @ok1(i8* %this) { + ret i64 42 +} + +; M0: define i64 @ok2 +; M1: define available_externally i64 @ok2 +define i64 @ok2(i8* %this, i64 %arg) { + ret i64 %arg +} + +; M0: define void @wrongtype1 +; M1: declare void @wrongtype1() +define void @wrongtype1(i8*) { + ret void +} + +; M0: define i128 @wrongtype2 +; M1: declare void @wrongtype2() +define i128 @wrongtype2(i8*) { + ret i128 0 +} + +; M0: define i64 @wrongtype3 +; M1: declare void @wrongtype3() +define i64 @wrongtype3() { + ret i64 0 +} + +; M0: define i64 @wrongtype4 +; M1: declare void @wrongtype4() +define i64 @wrongtype4(i8*, i8*) { + ret i64 0 +} + +; M0: define i64 @wrongtype5 +; M1: declare void @wrongtype5() +define i64 @wrongtype5(i8*, i128) { + ret i64 0 +} + +; M0: define i64 @usesthis +; M1: declare void @usesthis() +define i64 @usesthis(i8* %this) { + %i = ptrtoint i8* %this to i64 + ret i64 %i +} + +; M0: define i8 @reads +; M1: declare void @reads() +define i8 @reads(i8* %this) { + %l = load i8, i8* %this + ret i8 %l +} 
+ +!0 = !{i32 0, !"typeid"} diff --git a/test/Transforms/ThinLTOBitcodeWriter/split.ll b/test/Transforms/ThinLTOBitcodeWriter/split.ll index e08e92328b51..d37d10bd3560 100644 --- a/test/Transforms/ThinLTOBitcodeWriter/split.ll +++ b/test/Transforms/ThinLTOBitcodeWriter/split.ll @@ -1,20 +1,37 @@ -; RUN: opt -thinlto-bc -o %t %s -; RUN: llvm-modextract -b -n 0 -o %t0 %t -; RUN: llvm-modextract -b -n 1 -o %t1 %t +; Generate bitcode files with summary, as well as minimized bitcode without +; the debug metadata for the thin link. +; RUN: opt -thinlto-bc -thin-link-bitcode-file=%t2 -o %t %s +; RUN: llvm-modextract -b -n 0 -o %t0.bc %t +; RUN: llvm-modextract -b -n 1 -o %t1.bc %t +; RUN: llvm-modextract -b -n 0 -o %t0.thinlink.bc %t2 +; RUN: llvm-modextract -b -n 1 -o %t1.thinlink.bc %t2 ; RUN: not llvm-modextract -b -n 2 -o - %t 2>&1 | FileCheck --check-prefix=ERROR %s -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=M0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=M1 %s -; RUN: llvm-bcanalyzer -dump %t0 | FileCheck --check-prefix=BCA0 %s -; RUN: llvm-bcanalyzer -dump %t1 | FileCheck --check-prefix=BCA1 %s +; RUN: llvm-dis -o - %t0.bc | FileCheck --check-prefix=M0 %s +; RUN: llvm-dis -o - %t1.bc | FileCheck --check-prefix=M1 %s +; RUN: llvm-dis -o - %t0.thinlink.bc | FileCheck --check-prefix=NODEBUG %s +; RUN: llvm-dis -o - %t1.thinlink.bc | FileCheck --check-prefix=NODEBUG %s +; RUN: llvm-bcanalyzer -dump %t0.bc | FileCheck --check-prefix=BCA0 %s +; RUN: llvm-bcanalyzer -dump %t1.bc | FileCheck --check-prefix=BCA1 %s + +; Make sure the combined index files produced by both the normal and the +; thin link bitcode files are identical +; RUN: llvm-lto -thinlto -o %t3 %t0.bc +; Copy the minimized bitcode to the regular bitcode path so the module +; paths in the index are the same. +; RUN: cp %t0.thinlink.bc %t0.bc +; RUN: llvm-lto -thinlto -o %t4 %t0.bc +; RUN: diff %t3.thinlto.bc %t4.thinlto.bc ; ERROR: llvm-modextract: error: module index out of range; bitcode file contains 2 module(s) ; BCA0: <GLOBALVAL_SUMMARY_BLOCK ; BCA1-NOT: <GLOBALVAL_SUMMARY_BLOCK +$g = comdat any + ; M0: @g = external global i8{{$}} -; M1: @g = global i8 42, !type !0 -@g = global i8 42, !type !0 +; M1: @g = global i8 42, comdat, !type !0 +@g = global i8 42, comdat, !type !0 ; M0: define i8* @f() ; M1-NOT: @f() @@ -24,3 +41,11 @@ define i8* @f() { ; M1: !0 = !{i32 0, !"typeid"} !0 = !{i32 0, !"typeid"} + +; M0: !llvm.dbg.cu +; M1-NOT: !llvm.dbg.cu +; NODEBUG-NOT: !llvm.dbg.cu +!llvm.dbg.cu = !{} + +!1 = !{i32 2, !"Debug Info Version", i32 3} +!llvm.module.flags = !{!1} diff --git a/test/Transforms/ThinLTOBitcodeWriter/unsplittable.ll b/test/Transforms/ThinLTOBitcodeWriter/unsplittable.ll index fbc97a000971..718013e39b3e 100644 --- a/test/Transforms/ThinLTOBitcodeWriter/unsplittable.ll +++ b/test/Transforms/ThinLTOBitcodeWriter/unsplittable.ll @@ -1,6 +1,9 @@ -; RUN: opt -thinlto-bc -o %t %s +; RUN: opt -thinlto-bc -thin-link-bitcode-file=%t2 -o %t %s ; RUN: llvm-dis -o - %t | FileCheck %s ; RUN: llvm-bcanalyzer -dump %t | FileCheck --check-prefix=BCA %s +; When not splitting the module, the thin link bitcode file should simply be a +; copy of the regular module. 
+; RUN: diff %t %t2 ; BCA-NOT: <GLOBALVAL_SUMMARY_BLOCK diff --git a/test/Transforms/Util/MemorySSA/assume.ll b/test/Transforms/Util/MemorySSA/assume.ll deleted file mode 100644 index d771c78eb1cf..000000000000 --- a/test/Transforms/Util/MemorySSA/assume.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: opt -basicaa -memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; -; Ensures that assumes are treated as not reading or writing memory. - -declare void @llvm.assume(i1) - -define i32 @foo(i32* %a, i32* %b, i1 %c) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 4 - store i32 4, i32* %a, align 4 -; CHECK-NOT: MemoryDef -; CHECK: call void @llvm.assume - call void @llvm.assume(i1 %c) -; CHECK: MemoryUse(1) -; CHECK-NEXT: %1 = load i32 - %1 = load i32, i32* %a, align 4 - ret i32 %1 -} diff --git a/test/Transforms/Util/MemorySSA/atomic-clobber.ll b/test/Transforms/Util/MemorySSA/atomic-clobber.ll deleted file mode 100644 index acd819a89351..000000000000 --- a/test/Transforms/Util/MemorySSA/atomic-clobber.ll +++ /dev/null @@ -1,119 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; -; Ensures that atomic loads count as MemoryDefs - -; CHECK-LABEL: define i32 @foo -define i32 @foo(i32* %a, i32* %b) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 4 - store i32 4, i32* %a, align 4 -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: %1 = load atomic i32 - %1 = load atomic i32, i32* %b acquire, align 4 -; CHECK: MemoryUse(2) -; CHECK-NEXT: %2 = load i32 - %2 = load i32, i32* %a, align 4 - %3 = add i32 %1, %2 - ret i32 %3 -} - -; CHECK-LABEL: define void @bar -define void @bar(i32* %a) { -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: load atomic i32, i32* %a unordered, align 4 - load atomic i32, i32* %a unordered, align 4 -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: load atomic i32, i32* %a monotonic, align 4 - load atomic i32, i32* %a monotonic, align 4 -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: load atomic i32, i32* %a acquire, align 4 - load atomic i32, i32* %a acquire, align 4 -; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: load atomic i32, i32* %a seq_cst, align 4 - load atomic i32, i32* %a seq_cst, align 4 - ret void -} - -; CHECK-LABEL: define void @baz -define void @baz(i32* %a) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: %1 = load atomic i32 - %1 = load atomic i32, i32* %a acquire, align 4 -; CHECK: MemoryUse(1) -; CHECK-NEXT: %2 = load atomic i32, i32* %a unordered, align 4 - %2 = load atomic i32, i32* %a unordered, align 4 -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: %3 = load atomic i32, i32* %a monotonic, align 4 - %3 = load atomic i32, i32* %a monotonic, align 4 - ret void -} - -; CHECK-LABEL: define void @fences -define void @fences(i32* %a) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: fence acquire - fence acquire -; CHECK: MemoryUse(1) -; CHECK-NEXT: %1 = load i32, i32* %a - %1 = load i32, i32* %a - -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: fence release - fence release -; CHECK: MemoryUse(2) -; CHECK-NEXT: %2 = load i32, i32* %a - %2 = load i32, i32* %a - -; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: fence acq_rel - fence acq_rel -; CHECK: MemoryUse(3) -; CHECK-NEXT: %3 = load i32, i32* %a - %3 = load i32, i32* %a - -; CHECK: 4 = MemoryDef(3) -; CHECK-NEXT: fence seq_cst - fence 
seq_cst -; CHECK: MemoryUse(4) -; CHECK-NEXT: %4 = load i32, i32* %a - %4 = load i32, i32* %a - ret void -} - -; CHECK-LABEL: define void @seq_cst_clobber -define void @seq_cst_clobber(i32* noalias %a, i32* noalias %b) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: %1 = load atomic i32, i32* %a monotonic, align 4 - load atomic i32, i32* %a monotonic, align 4 - -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: %2 = load atomic i32, i32* %a seq_cst, align 4 - load atomic i32, i32* %a seq_cst, align 4 - -; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: load atomic i32, i32* %a monotonic, align 4 - load atomic i32, i32* %a monotonic, align 4 - - ret void -} - -; Ensure that AA hands us MRI_Mod on unreorderable atomic ops. -; -; This test is a bit implementation-specific. In particular, it depends on that -; we pass cmpxchg-load queries to AA, without trying to reason about them on -; our own. -; -; If AA gets more aggressive, we can find another way. -; -; CHECK-LABEL: define void @check_aa_is_sane -define void @check_aa_is_sane(i32* noalias %a, i32* noalias %b) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: cmpxchg i32* %a, i32 0, i32 1 acquire acquire - cmpxchg i32* %a, i32 0, i32 1 acquire acquire -; CHECK: MemoryUse(1) -; CHECK-NEXT: load i32, i32* %b, align 4 - load i32, i32* %b, align 4 - - ret void -} diff --git a/test/Transforms/Util/MemorySSA/basicaa-memcpy.ll b/test/Transforms/Util/MemorySSA/basicaa-memcpy.ll deleted file mode 100644 index bfd7c899b59a..000000000000 --- a/test/Transforms/Util/MemorySSA/basicaa-memcpy.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: opt -disable-output -basicaa -print-memoryssa %s 2>&1 | FileCheck %s - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind - -define void @source_clobber(i8* %a, i8* %b) { -; CHECK-LABEL: @source_clobber( -; CHECK-NEXT: ; 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 128, i32 1, i1 false) -; CHECK-NEXT: ; MemoryUse(liveOnEntry) -; CHECK-NEXT: [[X:%.*]] = load i8, i8* %b -; CHECK-NEXT: ret void -; - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 128, i32 1, i1 false) - %x = load i8, i8* %b - ret void -} diff --git a/test/Transforms/Util/MemorySSA/constant-memory.ll b/test/Transforms/Util/MemorySSA/constant-memory.ll deleted file mode 100644 index bc970e72fc4d..000000000000 --- a/test/Transforms/Util/MemorySSA/constant-memory.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; -; Things that BasicAA can prove points to constant memory should be -; liveOnEntry, as well. 
- -declare void @clobberAllTheThings() - -@str = private unnamed_addr constant [2 x i8] c"hi" - -define i8 @foo() { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: call void @clobberAllTheThings() - call void @clobberAllTheThings() - %1 = getelementptr [2 x i8], [2 x i8]* @str, i64 0, i64 0 -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: %2 = load i8 - %2 = load i8, i8* %1, align 1 - %3 = getelementptr [2 x i8], [2 x i8]* @str, i64 0, i64 1 -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: %4 = load i8 - %4 = load i8, i8* %3, align 1 - %5 = add i8 %2, %4 - ret i8 %5 -} - -define i8 @select(i1 %b) { - %1 = alloca i8, align 1 -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i8 0 - store i8 0, i8* %1, align 1 - -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: call void @clobberAllTheThings() - call void @clobberAllTheThings() - %2 = getelementptr [2 x i8], [2 x i8]* @str, i64 0, i64 0 - %3 = select i1 %b, i8* %2, i8* %1 -; CHECK: MemoryUse(2) -; CHECK-NEXT: %4 = load i8 - %4 = load i8, i8* %3, align 1 - ret i8 %4 -} diff --git a/test/Transforms/Util/MemorySSA/cyclicphi.ll b/test/Transforms/Util/MemorySSA/cyclicphi.ll deleted file mode 100644 index 6e91db959e4c..000000000000 --- a/test/Transforms/Util/MemorySSA/cyclicphi.ll +++ /dev/null @@ -1,123 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s - -%struct.hoge = type { i32, %struct.widget } -%struct.widget = type { i64 } - -define hidden void @quux(%struct.hoge *%f) align 2 { - %tmp = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1, i32 0 - %tmp24 = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1 - %tmp25 = bitcast %struct.widget* %tmp24 to i64** - br label %bb26 - -bb26: ; preds = %bb77, %0 -; CHECK: 2 = MemoryPhi({%0,liveOnEntry},{bb77,3}) -; CHECK-NEXT: br i1 undef, label %bb68, label %bb77 - br i1 undef, label %bb68, label %bb77 - -bb68: ; preds = %bb26 -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: %tmp69 = load i64, i64* null, align 8 - %tmp69 = load i64, i64* null, align 8 -; CHECK: 1 = MemoryDef(2) -; CHECK-NEXT: store i64 %tmp69, i64* %tmp, align 8 - store i64 %tmp69, i64* %tmp, align 8 - br label %bb77 - -bb77: ; preds = %bb68, %bb26 -; CHECK: 3 = MemoryPhi({bb26,2},{bb68,1}) -; CHECK: MemoryUse(3) -; CHECK-NEXT: %tmp78 = load i64*, i64** %tmp25, align 8 - %tmp78 = load i64*, i64** %tmp25, align 8 - %tmp79 = getelementptr inbounds i64, i64* %tmp78, i64 undef - br label %bb26 -} - -; CHECK-LABEL: define void @quux_skip -define void @quux_skip(%struct.hoge* noalias %f, i64* noalias %g) align 2 { - %tmp = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1, i32 0 - %tmp24 = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1 - %tmp25 = bitcast %struct.widget* %tmp24 to i64** - br label %bb26 - -bb26: ; preds = %bb77, %0 -; CHECK: 2 = MemoryPhi({%0,liveOnEntry},{bb77,3}) -; CHECK-NEXT: br i1 undef, label %bb68, label %bb77 - br i1 undef, label %bb68, label %bb77 - -bb68: ; preds = %bb26 -; CHECK: MemoryUse(2) -; CHECK-NEXT: %tmp69 = load i64, i64* %g, align 8 - %tmp69 = load i64, i64* %g, align 8 -; CHECK: 1 = MemoryDef(2) -; CHECK-NEXT: store i64 %tmp69, i64* %g, align 8 - store i64 %tmp69, i64* %g, align 8 - br label %bb77 - -bb77: ; preds = %bb68, %bb26 -; CHECK: 3 = MemoryPhi({bb26,2},{bb68,1}) -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: %tmp78 = load i64*, i64** %tmp25, align 8 - %tmp78 
= load i64*, i64** %tmp25, align 8 - br label %bb26 -} - -; CHECK-LABEL: define void @quux_dominated -define void @quux_dominated(%struct.hoge* noalias %f, i64* noalias %g) align 2 { - %tmp = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1, i32 0 - %tmp24 = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1 - %tmp25 = bitcast %struct.widget* %tmp24 to i64** - br label %bb26 - -bb26: ; preds = %bb77, %0 -; CHECK: 3 = MemoryPhi({%0,liveOnEntry},{bb77,2}) -; CHECK: MemoryUse(3) -; CHECK-NEXT: load i64*, i64** %tmp25, align 8 - load i64*, i64** %tmp25, align 8 - br i1 undef, label %bb68, label %bb77 - -bb68: ; preds = %bb26 -; CHECK: MemoryUse(3) -; CHECK-NEXT: %tmp69 = load i64, i64* %g, align 8 - %tmp69 = load i64, i64* %g, align 8 -; CHECK: 1 = MemoryDef(3) -; CHECK-NEXT: store i64 %tmp69, i64* %g, align 8 - store i64 %tmp69, i64* %g, align 8 - br label %bb77 - -bb77: ; preds = %bb68, %bb26 -; CHECK: 4 = MemoryPhi({bb26,3},{bb68,1}) -; CHECK: 2 = MemoryDef(4) -; CHECK-NEXT: store i64* null, i64** %tmp25, align 8 - store i64* null, i64** %tmp25, align 8 - br label %bb26 -} - -; CHECK-LABEL: define void @quux_nodominate -define void @quux_nodominate(%struct.hoge* noalias %f, i64* noalias %g) align 2 { - %tmp = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1, i32 0 - %tmp24 = getelementptr inbounds %struct.hoge, %struct.hoge* %f, i64 0, i32 1 - %tmp25 = bitcast %struct.widget* %tmp24 to i64** - br label %bb26 - -bb26: ; preds = %bb77, %0 -; CHECK: 2 = MemoryPhi({%0,liveOnEntry},{bb77,3}) -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: load i64*, i64** %tmp25, align 8 - load i64*, i64** %tmp25, align 8 - br i1 undef, label %bb68, label %bb77 - -bb68: ; preds = %bb26 -; CHECK: MemoryUse(2) -; CHECK-NEXT: %tmp69 = load i64, i64* %g, align 8 - %tmp69 = load i64, i64* %g, align 8 -; CHECK: 1 = MemoryDef(2) -; CHECK-NEXT: store i64 %tmp69, i64* %g, align 8 - store i64 %tmp69, i64* %g, align 8 - br label %bb77 - -bb77: ; preds = %bb68, %bb26 -; CHECK: 3 = MemoryPhi({bb26,2},{bb68,1}) -; CHECK-NEXT: br label %bb26 - br label %bb26 -} diff --git a/test/Transforms/Util/MemorySSA/forward-unreachable.ll b/test/Transforms/Util/MemorySSA/forward-unreachable.ll deleted file mode 100644 index 2bbf399daae4..000000000000 --- a/test/Transforms/Util/MemorySSA/forward-unreachable.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" - -define void @test() { -entry: - br i1 undef, label %split1, label %split2 - -split1: - store i16 undef, i16* undef, align 2 - br label %merge -split2: - br label %merge -forwardunreachable: - br label %merge -merge: -; The forwardunreachable block still needs an entry in the phi node, -; because it is reverse reachable, so the CFG still has it as a -; predecessor of the block -; CHECK: 3 = MemoryPhi({split1,1},{split2,liveOnEntry},{forwardunreachable,liveOnEntry}) - store i16 undef, i16* undef, align 2 - ret void -} - diff --git a/test/Transforms/Util/MemorySSA/function-clobber.ll b/test/Transforms/Util/MemorySSA/function-clobber.ll deleted file mode 100644 index a01893a5b954..000000000000 --- a/test/Transforms/Util/MemorySSA/function-clobber.ll +++ /dev/null @@ -1,54 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' 
-disable-output < %s 2>&1 | FileCheck %s -; -; Ensuring that external functions without attributes are MemoryDefs - -@g = external global i32 -declare void @modifyG() - -define i32 @foo() { -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: %1 = load i32 - %1 = load i32, i32* @g - -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 4 - store i32 4, i32* @g, align 4 - -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: call void @modifyG() - call void @modifyG() - -; CHECK: MemoryUse(2) -; CHECK-NEXT: %2 = load i32 - %2 = load i32, i32* @g - %3 = add i32 %2, %1 - ret i32 %3 -} - -declare void @readEverything() readonly -declare void @clobberEverything() - -; CHECK-LABEL: define void @bar -define void @bar() { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: call void @clobberEverything() - call void @clobberEverything() - br i1 undef, label %if.end, label %if.then - -if.then: -; CHECK: MemoryUse(1) -; CHECK-NEXT: call void @readEverything() - call void @readEverything() -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: call void @clobberEverything() - call void @clobberEverything() - br label %if.end - -if.end: -; CHECK: 3 = MemoryPhi({%0,1},{if.then,2}) -; CHECK: MemoryUse(3) -; CHECK-NEXT: call void @readEverything() - call void @readEverything() - ret void -} diff --git a/test/Transforms/Util/MemorySSA/function-mem-attrs.ll b/test/Transforms/Util/MemorySSA/function-mem-attrs.ll deleted file mode 100644 index 11383771a413..000000000000 --- a/test/Transforms/Util/MemorySSA/function-mem-attrs.ll +++ /dev/null @@ -1,59 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; -; Test that various function attributes give us sane results. 
- -@g = external global i32 - -declare void @readonlyFunction() readonly -declare void @noattrsFunction() - -define void @readonlyAttr() { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 0 - store i32 0, i32* @g, align 4 - - %1 = alloca i32, align 4 -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i32 0 - store i32 0, i32* %1, align 4 - -; CHECK: MemoryUse(1) -; CHECK-NEXT: call void @readonlyFunction() - call void @readonlyFunction() - -; CHECK: MemoryUse(1) -; CHECK-NEXT: call void @noattrsFunction() # -; Assume that #N is readonly - call void @noattrsFunction() readonly - - ; Sanity check that noattrsFunction is otherwise a MemoryDef -; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: call void @noattrsFunction() - call void @noattrsFunction() - ret void -} - -declare void @argMemOnly(i32*) argmemonly - -define void @inaccessableOnlyAttr() { - %1 = alloca i32, align 4 -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 0 - store i32 0, i32* %1, align 4 - -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i32 0 - store i32 0, i32* @g, align 4 - -; CHECK: MemoryUse(1) -; CHECK-NEXT: call void @argMemOnly(i32* %1) # -; Assume that #N is readonly - call void @argMemOnly(i32* %1) readonly - -; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: call void @argMemOnly(i32* %1) - call void @argMemOnly(i32* %1) - - ret void -} diff --git a/test/Transforms/Util/MemorySSA/invariant-groups.ll b/test/Transforms/Util/MemorySSA/invariant-groups.ll deleted file mode 100644 index 57247fe3b2b5..000000000000 --- a/test/Transforms/Util/MemorySSA/invariant-groups.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; -; Currently, MemorySSA doesn't support invariant groups. So, we should ignore -; invariant.group.barrier intrinsics entirely. We'll need to pay attention to -; them when/if we decide to support invariant groups. - -@g = external global i32 - -define i32 @foo(i32* %a) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 0 - store i32 0, i32* %a, align 4, !llvm.invariant.group !0 - -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i32 1 - store i32 1, i32* @g, align 4 - - %1 = bitcast i32* %a to i8* - %a8 = call i8* @llvm.invariant.group.barrier(i8* %1) - %a32 = bitcast i8* %a8 to i32* - -; CHECK: MemoryUse(2) -; CHECK-NEXT: %2 = load i32 - %2 = load i32, i32* %a32, align 4, !llvm.invariant.group !0 - ret i32 %2 -} - -declare i8* @llvm.invariant.group.barrier(i8*) - -!0 = !{!"group1"} diff --git a/test/Transforms/Util/MemorySSA/lifetime-simple.ll b/test/Transforms/Util/MemorySSA/lifetime-simple.ll deleted file mode 100644 index cdb36e31eb96..000000000000 --- a/test/Transforms/Util/MemorySSA/lifetime-simple.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; This test checks a number of things: -; First, the lifetime markers should not clobber any uses of Q or P. -; Second, the loads of P are MemoryUse(LiveOnEntry) due to the placement of the markers vs the loads. 
- -define i8 @test(i8* %P, i8* %Q) { -entry: -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: call void @llvm.lifetime.start(i64 32, i8* %P) - call void @llvm.lifetime.start(i64 32, i8* %P) -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: %0 = load i8, i8* %P - %0 = load i8, i8* %P -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i8 1, i8* %P - store i8 1, i8* %P -; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: call void @llvm.lifetime.end(i64 32, i8* %P) - call void @llvm.lifetime.end(i64 32, i8* %P) -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: %1 = load i8, i8* %P - %1 = load i8, i8* %P -; CHECK: MemoryUse(2) -; CHECK-NEXT: %2 = load i8, i8* %Q - %2 = load i8, i8* %Q - ret i8 %1 -} -declare void @llvm.lifetime.start(i64 %S, i8* nocapture %P) readonly -declare void @llvm.lifetime.end(i64 %S, i8* nocapture %P) diff --git a/test/Transforms/Util/MemorySSA/load-invariant.ll b/test/Transforms/Util/MemorySSA/load-invariant.ll deleted file mode 100644 index 3c55db11597b..000000000000 --- a/test/Transforms/Util/MemorySSA/load-invariant.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>' -verify-memoryssa -disable-output < %s 2>&1 | FileCheck %s -; -; Invariant loads should be considered live on entry, because, once the -; location is known to be dereferenceable, the value can never change. -; -; Currently XFAILed because this optimization was held back from the initial -; commit. - -@g = external global i32 - -declare void @clobberAllTheThings() - -; CHECK-LABEL: define i32 @foo -define i32 @foo() { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: call void @clobberAllTheThings() - call void @clobberAllTheThings() -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: %1 = load i32 - %1 = load i32, i32* @g, align 4, !invariant.load !0 - ret i32 %1 -} - -; CHECK-LABEL: define i32 @bar -define i32 @bar(i32* %a) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: call void @clobberAllTheThings() - call void @clobberAllTheThings() - -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: %1 = load atomic i32 - %1 = load atomic i32, i32* %a acquire, align 4, !invariant.load !0 - -; CHECK: MemoryUse(2) -; CHECK-NEXT: %2 = load i32 - %2 = load i32, i32* %a, align 4 - ret i32 %2 -} - -!0 = !{} diff --git a/test/Transforms/Util/MemorySSA/many-dom-backedge.ll b/test/Transforms/Util/MemorySSA/many-dom-backedge.ll deleted file mode 100644 index c2216a47bb20..000000000000 --- a/test/Transforms/Util/MemorySSA/many-dom-backedge.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; -; many-dom.ll, with an added back-edge back into the switch. -; Because people love their gotos. 
- -declare i1 @getBool() readnone - -define i32 @foo(i32* %p) { -entry: - br label %loopbegin - -loopbegin: -; CHECK: 8 = MemoryPhi({entry,liveOnEntry},{sw.epilog,6}) -; CHECK-NEXT: %n = - %n = phi i32 [ 0, %entry ], [ %1, %sw.epilog ] - %m = alloca i32, align 4 - switch i32 %n, label %sw.default [ - i32 0, label %sw.bb - i32 1, label %sw.bb1 - i32 2, label %sw.bb2 - i32 3, label %sw.bb3 - ] - -sw.bb: -; CHECK: 1 = MemoryDef(8) -; CHECK-NEXT: store i32 1 - store i32 1, i32* %m, align 4 - br label %sw.epilog - -sw.bb1: -; CHECK: 2 = MemoryDef(8) -; CHECK-NEXT: store i32 2 - store i32 2, i32* %m, align 4 - br label %sw.epilog - -sw.bb2: -; CHECK: 3 = MemoryDef(8) -; CHECK-NEXT: store i32 3 - store i32 3, i32* %m, align 4 - br label %sw.epilog - -sw.bb3: -; CHECK: 9 = MemoryPhi({loopbegin,8},{sw.almostexit,6}) -; CHECK: 4 = MemoryDef(9) -; CHECK-NEXT: store i32 4 - store i32 4, i32* %m, align 4 - br label %sw.epilog - -sw.default: -; CHECK: 5 = MemoryDef(8) -; CHECK-NEXT: store i32 5 - store i32 5, i32* %m, align 4 - br label %sw.epilog - -sw.epilog: -; CHECK: 10 = MemoryPhi({sw.default,5},{sw.bb3,4},{sw.bb,1},{sw.bb1,2},{sw.bb2,3}) -; CHECK-NEXT: MemoryUse(10) -; CHECK-NEXT: %0 = - %0 = load i32, i32* %m, align 4 -; CHECK: 6 = MemoryDef(10) -; CHECK-NEXT: %1 = - %1 = load volatile i32, i32* %p, align 4 - %2 = icmp eq i32 %0, %1 - br i1 %2, label %sw.almostexit, label %loopbegin - -sw.almostexit: - %3 = icmp eq i32 0, %1 - br i1 %3, label %exit, label %sw.bb3 - -exit: -; CHECK: 7 = MemoryDef(6) -; CHECK-NEXT: %4 = load volatile i32 - %4 = load volatile i32, i32* %p, align 4 - %5 = add i32 %4, %1 - ret i32 %5 -} diff --git a/test/Transforms/Util/MemorySSA/many-doms.ll b/test/Transforms/Util/MemorySSA/many-doms.ll deleted file mode 100644 index 1f57cbf1c4df..000000000000 --- a/test/Transforms/Util/MemorySSA/many-doms.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; -; Testing many dominators, specifically from a switch statement in C. 
- -declare i1 @getBool() readnone - -define i32 @foo(i32* %p) { -entry: - br label %loopbegin - -loopbegin: -; CHECK: 7 = MemoryPhi({entry,liveOnEntry},{sw.epilog,6}) -; CHECK-NEXT: %n = - %n = phi i32 [ 0, %entry ], [ %1, %sw.epilog ] - %m = alloca i32, align 4 - switch i32 %n, label %sw.default [ - i32 0, label %sw.bb - i32 1, label %sw.bb1 - i32 2, label %sw.bb2 - i32 3, label %sw.bb3 - ] - -sw.bb: -; CHECK: 1 = MemoryDef(7) -; CHECK-NEXT: store i32 1 - store i32 1, i32* %m, align 4 - br label %sw.epilog - -sw.bb1: -; CHECK: 2 = MemoryDef(7) -; CHECK-NEXT: store i32 2 - store i32 2, i32* %m, align 4 - br label %sw.epilog - -sw.bb2: -; CHECK: 3 = MemoryDef(7) -; CHECK-NEXT: store i32 3 - store i32 3, i32* %m, align 4 - br label %sw.epilog - -sw.bb3: -; CHECK: 4 = MemoryDef(7) -; CHECK-NEXT: store i32 4 - store i32 4, i32* %m, align 4 - br label %sw.epilog - -sw.default: -; CHECK: 5 = MemoryDef(7) -; CHECK-NEXT: store i32 5 - store i32 5, i32* %m, align 4 - br label %sw.epilog - -sw.epilog: -; CHECK: 8 = MemoryPhi({sw.default,5},{sw.bb,1},{sw.bb1,2},{sw.bb2,3},{sw.bb3,4}) -; CHECK-NEXT: MemoryUse(8) -; CHECK-NEXT: %0 = - %0 = load i32, i32* %m, align 4 -; CHECK: 6 = MemoryDef(8) -; CHECK-NEXT: %1 = - %1 = load volatile i32, i32* %p, align 4 - %2 = icmp eq i32 %0, %1 - br i1 %2, label %exit, label %loopbegin - -exit: - ret i32 %1 -} diff --git a/test/Transforms/Util/MemorySSA/multi-edges.ll b/test/Transforms/Util/MemorySSA/multi-edges.ll deleted file mode 100644 index 5d47728d6f5a..000000000000 --- a/test/Transforms/Util/MemorySSA/multi-edges.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; -; Makes sure we have a sane model if both successors of some block is the same -; block. - -define i32 @foo(i1 %a) { -entry: - %0 = alloca i32, align 4 -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 4 - store i32 4, i32* %0 - br i1 %a, label %Loop.Body, label %Loop.End - -Loop.Body: -; CHECK: 3 = MemoryPhi({entry,1},{Loop.End,4}) -; CHECK-NEXT: 2 = MemoryDef(3) -; CHECK-NEXT: store i32 5 - store i32 5, i32* %0, align 4 - br i1 %a, label %Loop.End, label %Loop.End ; WhyDoWeEvenHaveThatLever.gif - -Loop.End: -; CHECK: 4 = MemoryPhi({entry,1},{Loop.Body,2},{Loop.Body,2}) -; CHECK-NEXT: MemoryUse(4) -; CHECK-NEXT: %1 = load - %1 = load i32, i32* %0, align 4 - %2 = icmp eq i32 5, %1 - br i1 %2, label %Ret, label %Loop.Body - -Ret: - ret i32 %1 -} diff --git a/test/Transforms/Util/MemorySSA/multiple-backedges-hal.ll b/test/Transforms/Util/MemorySSA/multiple-backedges-hal.ll deleted file mode 100644 index 005a37c9add2..000000000000 --- a/test/Transforms/Util/MemorySSA/multiple-backedges-hal.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s - -; hfinkel's case -; [entry] -; | -; ..... -; (clobbering access - b) -; | -; .... ________________________________ -; \ / | -; (x) | -; ...... | -; | | -; | ______________________ | -; \ / | | -; (starting access) | | -; ... | | -; (clobbering access - a) | | -; ... | | -; | | | | -; | |_______________________| | -; | | -; |_________________________________| -; -; More specifically, one access, with multiple clobbering accesses. 
One of -; which strictly dominates the access, the other of which has a backedge - -; readnone so we don't have a 1:1 mapping of MemorySSA edges to Instructions. -declare void @doThingWithoutReading() readnone -declare i8 @getValue() readnone -declare i1 @getBool() readnone - -define hidden void @testcase(i8* %Arg) { -Entry: - call void @doThingWithoutReading() - %Val.Entry = call i8 @getValue() -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i8 %Val.Entry - store i8 %Val.Entry, i8* %Arg - call void @doThingWithoutReading() - br label %OuterLoop - -OuterLoop: -; CHECK: 4 = MemoryPhi({Entry,1},{InnerLoop.Tail,3}) -; CHECK-NEXT: %Val.Outer = - %Val.Outer = call i8 @getValue() -; CHECK: 2 = MemoryDef(4) -; CHECK-NEXT: store i8 %Val.Outer - store i8 %Val.Outer, i8* %Arg - call void @doThingWithoutReading() - br label %InnerLoop - -InnerLoop: -; CHECK: 5 = MemoryPhi({OuterLoop,2},{InnerLoop,3}) -; CHECK-NEXT: ; MemoryUse(5) -; CHECK-NEXT: %StartingAccess = load - %StartingAccess = load i8, i8* %Arg, align 4 - %Val.Inner = call i8 @getValue() -; CHECK: 3 = MemoryDef(5) -; CHECK-NEXT: store i8 %Val.Inner - store i8 %Val.Inner, i8* %Arg - call void @doThingWithoutReading() - %KeepGoing = call i1 @getBool() - br i1 %KeepGoing, label %InnerLoop.Tail, label %InnerLoop - -InnerLoop.Tail: - %KeepGoing.Tail = call i1 @getBool() - br i1 %KeepGoing.Tail, label %End, label %OuterLoop - -End: - ret void -} diff --git a/test/Transforms/Util/MemorySSA/multiple-locations.ll b/test/Transforms/Util/MemorySSA/multiple-locations.ll deleted file mode 100644 index 9a3e87e4ab6d..000000000000 --- a/test/Transforms/Util/MemorySSA/multiple-locations.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; -; Checks that basicAA is doing some amount of disambiguation for us - -define i32 @foo(i1 %cond) { - %a = alloca i32, align 4 - %b = alloca i32, align 4 -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 0 - store i32 0, i32* %a, align 4 -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i32 1 - store i32 1, i32* %b, align 4 - -; CHECK: MemoryUse(1) -; CHECK-NEXT: %1 = load i32 - %1 = load i32, i32* %a, align 4 -; CHECK: MemoryUse(2) -; CHECK-NEXT: %2 = load i32 - %2 = load i32, i32* %b, align 4 - - %3 = add i32 %1, %2 - ret i32 %3 -} diff --git a/test/Transforms/Util/MemorySSA/no-disconnected.ll b/test/Transforms/Util/MemorySSA/no-disconnected.ll deleted file mode 100644 index d1dcb15893ad..000000000000 --- a/test/Transforms/Util/MemorySSA/no-disconnected.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; -; This test ensures we don't end up with multiple reaching defs for a single -; use/phi edge If we were to optimize defs, we would end up with 2= -; MemoryDef(liveOnEntry) and 4 = MemoryDef(liveOnEntry) Both would mean both -; 1,2, and 3,4 would reach the phi node. 
Because the phi node can only have one -; entry on each edge, it would choose 2, 4 and disconnect 1 and 3 completely -; from the SSA graph, even though they are not dead - -define void @sink_store(i32 %index, i32* %foo, i32* %bar) { -entry: - %cmp = trunc i32 %index to i1 - br i1 %cmp, label %if.then, label %if.else - -if.then: ; preds = %entry -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 %index, i32* %foo, align 4 - store i32 %index, i32* %foo, align 4 -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i32 %index, i32* %bar, align 4 - store i32 %index, i32* %bar, align 4 - br label %if.end - -if.else: ; preds = %entry -; CHECK: 3 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i32 %index, i32* %foo, align 4 - store i32 %index, i32* %foo, align 4 -; CHECK: 4 = MemoryDef(3) -; CHECK-NEXT: store i32 %index, i32* %bar, align 4 - store i32 %index, i32* %bar, align 4 - br label %if.end - -if.end: ; preds = %if.else, %if.then -; CHECK: 5 = MemoryPhi({if.then,2},{if.else,4}) -; CHECK: MemoryUse(5) -; CHECK-NEXT: %c = load i32, i32* %foo - %c = load i32, i32* %foo -; CHECK: MemoryUse(5) -; CHECK-NEXT: %d = load i32, i32* %bar - %d = load i32, i32* %bar - ret void -} diff --git a/test/Transforms/Util/MemorySSA/optimize-use.ll b/test/Transforms/Util/MemorySSA/optimize-use.ll deleted file mode 100644 index 8a8f2dd50959..000000000000 --- a/test/Transforms/Util/MemorySSA/optimize-use.ll +++ /dev/null @@ -1,37 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s - -; Function Attrs: ssp uwtable -define i32 @main() { -entry: -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: %call = call noalias i8* @_Znwm(i64 4) - %call = call noalias i8* @_Znwm(i64 4) - %0 = bitcast i8* %call to i32* -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: %call1 = call noalias i8* @_Znwm(i64 4) - %call1 = call noalias i8* @_Znwm(i64 4) - %1 = bitcast i8* %call1 to i32* -; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: store i32 5, i32* %0, align 4 - store i32 5, i32* %0, align 4 -; CHECK: 4 = MemoryDef(3) -; CHECK-NEXT: store i32 7, i32* %1, align 4 - store i32 7, i32* %1, align 4 -; CHECK: MemoryUse(3) -; CHECK-NEXT: %2 = load i32, i32* %0, align 4 - %2 = load i32, i32* %0, align 4 -; CHECK: MemoryUse(4) -; CHECK-NEXT: %3 = load i32, i32* %1, align 4 - %3 = load i32, i32* %1, align 4 -; CHECK: MemoryUse(3) -; CHECK-NEXT: %4 = load i32, i32* %0, align 4 - %4 = load i32, i32* %0, align 4 -; CHECK: MemoryUse(4) -; CHECK-NEXT: %5 = load i32, i32* %1, align 4 - %5 = load i32, i32* %1, align 4 - %add = add nsw i32 %3, %5 - ret i32 %add -} - -declare noalias i8* @_Znwm(i64) diff --git a/test/Transforms/Util/MemorySSA/phi-translation.ll b/test/Transforms/Util/MemorySSA/phi-translation.ll deleted file mode 100644 index c91faf2ac20b..000000000000 --- a/test/Transforms/Util/MemorySSA/phi-translation.ll +++ /dev/null @@ -1,181 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s - -; %ptr can't alias %local, so we should be able to optimize the use of %local to -; point to the store to %local. 
-; CHECK-LABEL: define void @check -define void @check(i8* %ptr, i1 %bool) { -entry: - %local = alloca i8, align 1 -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i8 0, i8* %local, align 1 - store i8 0, i8* %local, align 1 - br i1 %bool, label %if.then, label %if.end - -if.then: - %p2 = getelementptr inbounds i8, i8* %ptr, i32 1 -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i8 0, i8* %p2, align 1 - store i8 0, i8* %p2, align 1 - br label %if.end - -if.end: -; CHECK: 3 = MemoryPhi({entry,1},{if.then,2}) -; CHECK: MemoryUse(1) -; CHECK-NEXT: load i8, i8* %local, align 1 - load i8, i8* %local, align 1 - ret void -} - -; CHECK-LABEL: define void @check2 -define void @check2(i1 %val1, i1 %val2, i1 %val3) { -entry: - %local = alloca i8, align 1 - %local2 = alloca i8, align 1 - -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i8 0, i8* %local - store i8 0, i8* %local - br i1 %val1, label %if.then, label %phi.3 - -if.then: -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i8 2, i8* %local2 - store i8 2, i8* %local2 - br i1 %val2, label %phi.2, label %phi.3 - -phi.3: -; CHECK: 5 = MemoryPhi({entry,1},{if.then,2}) -; CHECK: 3 = MemoryDef(5) -; CHECK-NEXT: store i8 3, i8* %local2 - store i8 3, i8* %local2 - br i1 %val3, label %phi.2, label %phi.1 - -phi.2: -; CHECK: 6 = MemoryPhi({if.then,2},{phi.3,3}) -; CHECK: 4 = MemoryDef(6) -; CHECK-NEXT: store i8 4, i8* %local2 - store i8 4, i8* %local2 - br label %phi.1 - -phi.1: -; Order matters here; phi.2 needs to come before phi.3, because that's the order -; they're visited in. -; CHECK: 7 = MemoryPhi({phi.2,4},{phi.3,3}) -; CHECK: MemoryUse(1) -; CHECK-NEXT: load i8, i8* %local - load i8, i8* %local - ret void -} - -; CHECK-LABEL: define void @cross_phi -define void @cross_phi(i8* noalias %p1, i8* noalias %p2) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i8 0, i8* %p1 - store i8 0, i8* %p1 -; CHECK: MemoryUse(1) -; CHECK-NEXT: load i8, i8* %p1 - load i8, i8* %p1 - br i1 undef, label %a, label %b - -a: -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i8 0, i8* %p2 - store i8 0, i8* %p2 - br i1 undef, label %c, label %d - -b: -; CHECK: 3 = MemoryDef(1) -; CHECK-NEXT: store i8 1, i8* %p2 - store i8 1, i8* %p2 - br i1 undef, label %c, label %d - -c: -; CHECK: 6 = MemoryPhi({a,2},{b,3}) -; CHECK: 4 = MemoryDef(6) -; CHECK-NEXT: store i8 2, i8* %p2 - store i8 2, i8* %p2 - br label %e - -d: -; CHECK: 7 = MemoryPhi({a,2},{b,3}) -; CHECK: 5 = MemoryDef(7) -; CHECK-NEXT: store i8 3, i8* %p2 - store i8 3, i8* %p2 - br label %e - -e: -; 8 = MemoryPhi({c,4},{d,5}) -; CHECK: MemoryUse(1) -; CHECK-NEXT: load i8, i8* %p1 - load i8, i8* %p1 - ret void -} - -; CHECK-LABEL: define void @looped -define void @looped(i8* noalias %p1, i8* noalias %p2) { -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store i8 0, i8* %p1 - store i8 0, i8* %p1 - br label %loop.1 - -loop.1: -; CHECK: 5 = MemoryPhi({%0,1},{loop.3,4}) -; CHECK: 2 = MemoryDef(5) -; CHECK-NEXT: store i8 0, i8* %p2 - store i8 0, i8* %p2 - br i1 undef, label %loop.2, label %loop.3 - -loop.2: -; CHECK: 6 = MemoryPhi({loop.1,2},{loop.3,4}) -; CHECK: 3 = MemoryDef(6) -; CHECK-NEXT: store i8 1, i8* %p2 - store i8 1, i8* %p2 - br label %loop.3 - -loop.3: -; CHECK: 7 = MemoryPhi({loop.1,2},{loop.2,3}) -; CHECK: 4 = MemoryDef(7) -; CHECK-NEXT: store i8 2, i8* %p2 - store i8 2, i8* %p2 -; CHECK: MemoryUse(1) -; CHECK-NEXT: load i8, i8* %p1 - load i8, i8* %p1 - br i1 undef, label %loop.2, label %loop.1 -} - -; CHECK-LABEL: define void @looped_visitedonlyonce -define void 
@looped_visitedonlyonce(i8* noalias %p1, i8* noalias %p2) { - br label %while.cond - -while.cond: -; CHECK: 4 = MemoryPhi({%0,liveOnEntry},{if.end,3}) -; CHECK-NEXT: br i1 undef, label %if.then, label %if.end - br i1 undef, label %if.then, label %if.end - -if.then: -; CHECK: 1 = MemoryDef(4) -; CHECK-NEXT: store i8 0, i8* %p1 - store i8 0, i8* %p1 - br i1 undef, label %if.end, label %if.then2 - -if.then2: -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store i8 1, i8* %p2 - store i8 1, i8* %p2 - br label %if.end - -if.end: -; CHECK: 5 = MemoryPhi({while.cond,4},{if.then,1},{if.then2,2}) -; CHECK: MemoryUse(5) -; CHECK-NEXT: load i8, i8* %p1 - load i8, i8* %p1 -; CHECK: 3 = MemoryDef(5) -; CHECK-NEXT: store i8 2, i8* %p2 - store i8 2, i8* %p2 -; CHECK: MemoryUse(5) -; CHECK-NEXT: load i8, i8* %p1 - load i8, i8* %p1 - br label %while.cond -} - diff --git a/test/Transforms/Util/MemorySSA/pr28880.ll b/test/Transforms/Util/MemorySSA/pr28880.ll deleted file mode 100644 index ae64c0c5d73e..000000000000 --- a/test/Transforms/Util/MemorySSA/pr28880.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s - -; This testcase is reduced from SingleSource/Benchmarks/Misc/fbench.c -; It is testing to make sure that the MemorySSA use optimizer -; comes up with right answers when dealing with multiple MemoryLocations -; over different blocks. See PR28880 for more details. -@global = external hidden unnamed_addr global double, align 8 -@global.1 = external hidden unnamed_addr global double, align 8 - -; Function Attrs: nounwind ssp uwtable -define hidden fastcc void @hoge() unnamed_addr #0 { -bb: - br i1 undef, label %bb1, label %bb2 - -bb1: ; preds = %bb -; These accesses should not conflict. -; CHECK: 1 = MemoryDef(liveOnEntry) -; 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store double undef, double* @global, align 8 - store double undef, double* @global, align 8 -; CHECK: MemoryUse(liveOnEntry) -; MemoryUse(liveOnEntry) -; CHECK-NEXT: %tmp = load double, double* @global.1, align 8 - %tmp = load double, double* @global.1, align 8 - unreachable - -bb2: ; preds = %bb - br label %bb3 - -bb3: ; preds = %bb2 - br i1 undef, label %bb4, label %bb6 - -bb4: ; preds = %bb3 -; These accesses should conflict. 
-; CHECK: 2 = MemoryDef(liveOnEntry) -; 2 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store double 0.000000e+00, double* @global.1, align 8 - store double 0.000000e+00, double* @global.1, align 8 -; CHECK: MemoryUse(2) -; MemoryUse(2) -; CHECK-NEXT: %tmp5 = load double, double* @global.1, align 8 - %tmp5 = load double, double* @global.1, align 8 - unreachable - -bb6: ; preds = %bb3 - unreachable -} - -attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" } - diff --git a/test/Transforms/Util/MemorySSA/volatile-clobber.ll b/test/Transforms/Util/MemorySSA/volatile-clobber.ll deleted file mode 100644 index d6f960f3e382..000000000000 --- a/test/Transforms/Util/MemorySSA/volatile-clobber.ll +++ /dev/null @@ -1,94 +0,0 @@ -; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s -; -; Ensures that volatile stores/loads count as MemoryDefs - -; CHECK-LABEL: define i32 @foo -define i32 @foo() { - %1 = alloca i32, align 4 -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: store volatile i32 4 - store volatile i32 4, i32* %1, align 4 -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: store volatile i32 8 - store volatile i32 8, i32* %1, align 4 -; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: %2 = load volatile i32 - %2 = load volatile i32, i32* %1, align 4 -; CHECK: 4 = MemoryDef(3) -; CHECK-NEXT: %3 = load volatile i32 - %3 = load volatile i32, i32* %1, align 4 - %4 = add i32 %3, %2 - ret i32 %4 -} - -; Ensuring that we don't automatically hoist nonvolatile loads around volatile -; loads -; CHECK-LABEL define void @volatile_only -define void @volatile_only(i32* %arg1, i32* %arg2) { - ; Trivially NoAlias/MustAlias - %a = alloca i32 - %b = alloca i32 - -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: load volatile i32, i32* %a - load volatile i32, i32* %a -; CHECK: MemoryUse(liveOnEntry) -; CHECK-NEXT: load i32, i32* %b - load i32, i32* %b -; CHECK: MemoryUse(1) -; CHECK-NEXT: load i32, i32* %a - load i32, i32* %a - - ; MayAlias -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: load volatile i32, i32* %arg1 - load volatile i32, i32* %arg1 -; CHECK: MemoryUse(2) -; CHECK-NEXT: load i32, i32* %arg2 - load i32, i32* %arg2 - - ret void -} - -; Ensuring that volatile atomic operations work properly. 
-; CHECK-LABEL define void @volatile_atomics -define void @volatile_atomics(i32* %arg1, i32* %arg2) { - %a = alloca i32 - %b = alloca i32 - - ; Trivially NoAlias/MustAlias - -; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: load atomic volatile i32, i32* %a acquire, align 4 - load atomic volatile i32, i32* %a acquire, align 4 -; CHECK: MemoryUse(1) -; CHECK-NEXT: load i32, i32* %b - load i32, i32* %b - -; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: load atomic volatile i32, i32* %a monotonic, align 4 - load atomic volatile i32, i32* %a monotonic, align 4 -; CHECK: MemoryUse(1) -; CHECK-NEXT: load i32, i32* %b - load i32, i32* %b -; CHECK: MemoryUse(1) -; CHECK-NEXT: load atomic i32, i32* %b unordered, align 4 - load atomic i32, i32* %b unordered, align 4 -; CHECK: MemoryUse(2) -; CHECK-NEXT: load atomic i32, i32* %a unordered, align 4 - load atomic i32, i32* %a unordered, align 4 -; CHECK: MemoryUse(2) -; CHECK-NEXT: load i32, i32* %a - load i32, i32* %a - - ; MayAlias -; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: load atomic volatile i32, i32* %arg1 monotonic, align 4 - load atomic volatile i32, i32* %arg1 monotonic, align 4 -; CHECK: MemoryUse(3) -; CHECK-NEXT: load i32, i32* %arg2 - load i32, i32* %arg2 - - ret void -} diff --git a/test/Transforms/Util/PredicateInfo/condprop.ll b/test/Transforms/Util/PredicateInfo/condprop.ll new file mode 100644 index 000000000000..79c76baa6f61 --- /dev/null +++ b/test/Transforms/Util/PredicateInfo/condprop.ll @@ -0,0 +1,471 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -print-predicateinfo -analyze < %s 2>&1 | FileCheck %s + +@a = external global i32 ; <i32*> [#uses=7] + +define i32 @test1() nounwind { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[TMP1]], label [[BB:%.*]], label [[BB1:%.*]] +; CHECK: bb: +; CHECK-NEXT: br label [[BB8:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 5 +; CHECK-NEXT: br i1 [[TMP3]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB8]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 4 +; CHECK-NEXT: br i1 [[TMP5]], label [[BB4:%.*]], label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 5 +; CHECK-NEXT: br label [[BB8]] +; CHECK: bb5: +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 5 +; CHECK-NEXT: br i1 [[TMP9]], label [[BB6:%.*]], label [[BB7:%.*]] +; CHECK: bb6: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], 4 +; CHECK-NEXT: br label [[BB8]] +; CHECK: bb7: +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: br label [[BB8]] +; CHECK: bb8: +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP12]], [[BB7]] ], [ [[TMP11]], [[BB6]] ], [ [[TMP7]], [[BB4]] ], [ 4, [[BB2]] ], [ 5, [[BB]] ] +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: return: +; CHECK-NEXT: ret i32 [[DOT0]] +; +entry: + %0 = load i32, i32* @a, align 4 + %1 = icmp eq i32 %0, 4 + br i1 %1, label %bb, label %bb1 + +bb: ; preds = %entry + br label %bb8 + +bb1: ; preds = %entry + %2 = load i32, i32* @a, align 4 + %3 = icmp eq i32 %2, 5 + br i1 %3, label %bb2, label %bb3 + +bb2: ; preds = %bb1 
+ br label %bb8 + +bb3: ; preds = %bb1 + %4 = load i32, i32* @a, align 4 + %5 = icmp eq i32 %4, 4 + br i1 %5, label %bb4, label %bb5 + +bb4: ; preds = %bb3 + %6 = load i32, i32* @a, align 4 + %7 = add i32 %6, 5 + br label %bb8 + +bb5: ; preds = %bb3 + %8 = load i32, i32* @a, align 4 + %9 = icmp eq i32 %8, 5 + br i1 %9, label %bb6, label %bb7 + +bb6: ; preds = %bb5 + %10 = load i32, i32* @a, align 4 + %11 = add i32 %10, 4 + br label %bb8 + +bb7: ; preds = %bb5 + %12 = load i32, i32* @a, align 4 + br label %bb8 + +bb8: ; preds = %bb7, %bb6, %bb4, %bb2, %bb + %.0 = phi i32 [ %12, %bb7 ], [ %11, %bb6 ], [ %7, %bb4 ], [ 4, %bb2 ], [ 5, %bb ] + br label %return + +return: ; preds = %bb8 + ret i32 %.0 +} + +declare void @foo(i1) +declare void @bar(i32) + +define void @test3(i32 %x, i32 %y) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] +; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) +; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) +; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]] +; CHECK: both_zero: +; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) +; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) +; CHECK-NEXT: call void @bar(i32 [[X_0]]) +; CHECK-NEXT: call void @bar(i32 [[Y_0]]) +; CHECK-NEXT: ret void +; CHECK: nope: +; CHECK-NEXT: call void @foo(i1 [[Z_0]]) +; CHECK-NEXT: ret void +; + %xz = icmp eq i32 %x, 0 + %yz = icmp eq i32 %y, 0 + %z = and i1 %xz, %yz + br i1 %z, label %both_zero, label %nope +both_zero: + call void @foo(i1 %xz) + call void @foo(i1 %yz) + call void @bar(i32 %x) + call void @bar(i32 %y) + ret void +nope: + call void @foo(i1 %z) + ret void +} + +define void @test4(i1 %b, i32 %x) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]] +; CHECK: sw: +; CHECK: i32 0, label [[CASE0:%.*]] +; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 2, label [[CASE0]] +; CHECK-NEXT: i32 3, label [[CASE3]] +; CHECK-NEXT: i32 4, label [[DEFAULT:%.*]] +; CHECK-NEXT: ] Edge: [label [[SW]],label %case1] } +; CHECK-NEXT: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X:%.*]]) +; CHECK-NEXT: switch i32 [[X]], label [[DEFAULT]] [ +; CHECK-NEXT: i32 0, label [[CASE0]] +; CHECK-NEXT: i32 1, label [[CASE1]] +; CHECK-NEXT: i32 2, label [[CASE0]] +; CHECK-NEXT: i32 3, label [[CASE3]] +; CHECK-NEXT: i32 4, label [[DEFAULT]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: call void @bar(i32 [[X]]) +; CHECK-NEXT: ret void +; CHECK: case0: +; CHECK-NEXT: call void @bar(i32 [[X]]) +; CHECK-NEXT: ret void +; CHECK: case1: +; CHECK-NEXT: call void @bar(i32 [[X_0]]) +; CHECK-NEXT: ret void +; CHECK: case3: +; CHECK-NEXT: call void @bar(i32 [[X]]) +; CHECK-NEXT: ret void +; + br i1 %b, label %sw, label %case3 +sw: + switch i32 %x, label %default [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case0 + i32 3, label %case3 + i32 4, label %default + ] +default: + call void @bar(i32 %x) + ret void +case0: + call void @bar(i32 %x) + ret void +case1: + call void @bar(i32 %x) + ret void +case3: + call void @bar(i32 %x) + ret void +} + +define i1 @test5(i32 %x, i32 %y) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] +; CHECK: [[X_0:%.*]] = call i32 
@llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[Y_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[X_0]], [[Y_0]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: different: +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[X_1]], [[Y_1]] +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp = icmp eq i32 %x, %y + br i1 %cmp, label %same, label %different + +same: + %cmp2 = icmp ne i32 %x, %y + ret i1 %cmp2 + +different: + %cmp3 = icmp eq i32 %x, %y + ret i1 %cmp3 +} + +define i1 @test6(i32 %x, i32 %y) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[X]], [[Y]] +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: different: +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp2 = icmp ne i32 %x, %y + %cmp = icmp eq i32 %x, %y + %cmp3 = icmp eq i32 %x, %y + br i1 %cmp, label %same, label %different + +same: + ret i1 %cmp2 + +different: + ret i1 %cmp3 +} + +define i1 @test6_fp(float %x, float %y) { +; CHECK-LABEL: @test6_fp( +; CHECK-NEXT: [[CMP2:%.*]] = fcmp une float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[X]], [[Y]] +; CHECK-NEXT: [[CMP3:%.*]] = fcmp oeq float [[X]], [[Y]] +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: different: +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp2 = fcmp une float %x, %y + %cmp = fcmp oeq float %x, %y + %cmp3 = fcmp oeq float %x, %y + br i1 %cmp, label %same, label %different + +same: + ret i1 %cmp2 + +different: + ret i1 %cmp3 +} + +define i1 @test7(i32 %x, i32 %y) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]] +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[Y_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X_0]], [[Y_0]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: different: +; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[X_1]], [[Y_1]] +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp = icmp sgt i32 %x, %y + br i1 %cmp, label %same, label %different + +same: + %cmp2 = icmp sle i32 %x, %y + ret i1 %cmp2 + +different: + %cmp3 = icmp sgt i32 %x, %y + ret i1 %cmp3 +} + +define i1 @test7_fp(float %x, float %y) { +; CHECK-LABEL: @test7_fp( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]] +; CHECK: [[X_0:%.*]] = call float @llvm.ssa.copy.f32(float [[X]]) +; CHECK: [[X_1:%.*]] = call float @llvm.ssa.copy.f32(float [[X]]) +; CHECK: [[Y_0:%.*]] = call float @llvm.ssa.copy.f32(float [[Y]]) +; CHECK: [[Y_1:%.*]] = call float @llvm.ssa.copy.f32(float [[Y]]) +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ule float [[X_0]], [[Y_0]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: different: +; CHECK-NEXT: [[CMP3:%.*]] = fcmp ogt float [[X_1]], [[Y_1]] +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp = fcmp ogt float %x, %y + br i1 %cmp, label %same, label 
%different + +same: + %cmp2 = fcmp ule float %x, %y + ret i1 %cmp2 + +different: + %cmp3 = fcmp ogt float %x, %y + ret i1 %cmp3 +} + +define i1 @test8(i32 %x, i32 %y) { +; CHECK-LABEL: @test8( +; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], [[Y]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[X]], [[Y]] +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: different: +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp2 = icmp sle i32 %x, %y + %cmp = icmp sgt i32 %x, %y + %cmp3 = icmp sgt i32 %x, %y + br i1 %cmp, label %same, label %different + +same: + ret i1 %cmp2 + +different: + ret i1 %cmp3 +} + +define i1 @test8_fp(float %x, float %y) { +; CHECK-LABEL: @test8_fp( +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ule float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X]], [[Y]] +; CHECK-NEXT: [[CMP3:%.*]] = fcmp ogt float [[X]], [[Y]] +; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK: same: +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: different: +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp2 = fcmp ule float %x, %y + %cmp = fcmp ogt float %x, %y + %cmp3 = fcmp ogt float %x, %y + br i1 %cmp, label %same, label %different + +same: + ret i1 %cmp2 + +different: + ret i1 %cmp3 +} + +define i32 @test9(i32 %i, i32 %j) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] +; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]]) +; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]]) +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK: cond_true: +; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] +; CHECK-NEXT: ret i32 [[DIFF]] +; CHECK: ret: +; CHECK-NEXT: ret i32 5 +; + %cmp = icmp eq i32 %i, %j + br i1 %cmp, label %cond_true, label %ret + +cond_true: + %diff = sub i32 %i, %j + ret i32 %diff + +ret: + ret i32 5 +} + +define i32 @test10(i32 %j, i32 %i) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] +; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]]) +; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]]) +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK: cond_true: +; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] +; CHECK-NEXT: ret i32 [[DIFF]] +; CHECK: ret: +; CHECK-NEXT: ret i32 5 +; + %cmp = icmp eq i32 %i, %j + br i1 %cmp, label %cond_true, label %ret + +cond_true: + %diff = sub i32 %i, %j + ret i32 %diff + +ret: + ret i32 5 +} + +declare i32 @yogibar() + +define i32 @test11(i32 %x) { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[V0:%.*]] = call i32 @yogibar() +; CHECK-NEXT: [[V1:%.*]] = call i32 @yogibar() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]] +; CHECK: [[V0_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V0]]) +; CHECK: [[V1_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V1]]) +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]] +; CHECK: cond_true: +; CHECK-NEXT: ret i32 [[V1_0]] +; CHECK: next: +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0_0]] +; CHECK: [[V0_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V0_0]]) +; CHECK-NEXT: br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]] +; CHECK: cond_true2: +; CHECK-NEXT: ret i32 [[V0_0_1]] +; CHECK: next2: +; CHECK-NEXT: ret i32 0 +; + %v0 = call i32 @yogibar() + %v1 = call i32 @yogibar() + %cmp = icmp eq i32 %v0, %v1 + br i1 %cmp, label %cond_true, label %next 
+ +cond_true: + ret i32 %v1 + +next: + %cmp2 = icmp eq i32 %x, %v0 + br i1 %cmp2, label %cond_true2, label %next2 + +cond_true2: + ret i32 %v0 + +next2: + ret i32 0 +} + +define i32 @test12(i32 %x) { +; CHECK-LABEL: @test12( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK: cond_true: +; CHECK-NEXT: br label [[RET:%.*]] +; CHECK: cond_false: +; CHECK-NEXT: br label [[RET]] +; CHECK: ret: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0]], [[COND_TRUE]] ], [ [[X_1]], [[COND_FALSE]] ] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %cond_true, label %cond_false + +cond_true: + br label %ret + +cond_false: + br label %ret + +ret: + %res = phi i32 [ %x, %cond_true ], [ %x, %cond_false ] + ret i32 %res +} diff --git a/test/Transforms/Util/PredicateInfo/diamond.ll b/test/Transforms/Util/PredicateInfo/diamond.ll new file mode 100644 index 000000000000..e3f56d88caf0 --- /dev/null +++ b/test/Transforms/Util/PredicateInfo/diamond.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -print-predicateinfo < %s 2>&1 | FileCheck %s +define i1 @f(i32 %x, i1 %y) { +; CHECK-LABEL: @f( +; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0 +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2 +; CHECK: [[X2_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X2]]) +; CHECK-NEXT: br i1 [[CMP2]], label [[BB2]], label [[BB3]] +; CHECK: bb2: +; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret i1 false +; + br i1 %y, label %bb0, label %bb1 + bb0: + %cmp = icmp sge i32 %x, 0 ; x > 0 + br i1 %cmp, label %bb2, label %bb3 + bb1: + %x2 = add nsw nuw i32 %x, 1 + %cmp2 = icmp sge i32 %x2, 2 ; x+1 > 2 / x > 1 + br i1 %cmp2, label %bb2, label %bb3 + bb2: + %x3 = phi i32 [ %x, %bb0 ], [ %x2, %bb1 ] + br label %bb3 + bb3: + ret i1 0 +} + +define i1 @g(i32 %x, i1 %y) { +; CHECK-LABEL: @g( +; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0 +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2 +; CHECK: [[X2_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X2]]) +; CHECK-NEXT: br i1 [[CMP2]], label [[BB3]], label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret i1 false +; + br i1 %y, label %bb0, label %bb1 + bb0: + %cmp = icmp sge i32 %x, 0 ; x > 0 + br i1 %cmp, label %bb3, label %bb2 + bb1: + %x2 = add nsw nuw i32 %x, 1 + %cmp2 = icmp sge i32 %x2, 2 ; x+1 > 2 / x > 1 + br i1 %cmp2, label %bb3, label %bb2 + bb2: + %x3 = phi i32 [ %x, %bb0 ], [ %x2, %bb1 ] + br label %bb3 + bb3: + ret i1 0 +} + diff --git a/test/Transforms/Util/PredicateInfo/edge.ll 
b/test/Transforms/Util/PredicateInfo/edge.ll new file mode 100644 index 000000000000..6c58540e1050 --- /dev/null +++ b/test/Transforms/Util/PredicateInfo/edge.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -print-predicateinfo -analyze < %s 2>&1 | FileCheck %s + +define i32 @f1(i32 %x) { +; CHECK-LABEL: @f1( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], [[X]] +; CHECK-NEXT: ret i32 [[FOO]] +; +bb0: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %bb2, label %bb1 +bb1: + br label %bb2 +bb2: + %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ] + %foo = add i32 %cond, %x + ret i32 %foo +} + +define i32 @f2(i32 %x) { +; CHECK-LABEL: @f2( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 0 +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK-NEXT: br i1 [[CMP]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], [[X]] +; CHECK-NEXT: ret i32 [[FOO]] +; +bb0: + %cmp = icmp ne i32 %x, 0 + br i1 %cmp, label %bb1, label %bb2 +bb1: + br label %bb2 +bb2: + %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ] + %foo = add i32 %cond, %x + ret i32 %foo +} + +define i32 @f3(i32 %x) { +; CHECK-LABEL: @f3( +; CHECK-NEXT: bb0: +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X:%.*]]) +; CHECK-NEXT: switch i32 [[X]], label [[BB1:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], [[X]] +; CHECK-NEXT: ret i32 [[FOO]] +; +bb0: + switch i32 %x, label %bb1 [ i32 0, label %bb2] +bb1: + br label %bb2 +bb2: + %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ] + %foo = add i32 %cond, %x + ret i32 %foo +} + + +define double @fcmp_oeq_not_zero(double %x, double %y) { +; CHECK-LABEL: @fcmp_oeq_not_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 2.000000e+00 +; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]]) +; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK: if: +; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret double [[RETVAL]] +; +entry: + %cmp = fcmp oeq double %y, 2.0 + br i1 %cmp, label %if, label %return + +if: + %div = fdiv double %x, %y + br label %return + +return: + %retval = phi double [ %div, %if ], [ %x, %entry ] + ret double %retval + +} + +define double @fcmp_une_not_zero(double %x, double %y) { +; CHECK-LABEL: @fcmp_une_not_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], 2.000000e+00 +; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]]) +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK: else: +; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] +; CHECK-NEXT: br label [[RETURN]] +; 
CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret double [[RETVAL]] +; +entry: + %cmp = fcmp une double %y, 2.0 + br i1 %cmp, label %return, label %else + +else: + %div = fdiv double %x, %y + br label %return + +return: + %retval = phi double [ %div, %else ], [ %x, %entry ] + ret double %retval + +} + +define double @fcmp_oeq_zero(double %x, double %y) { +; CHECK-LABEL: @fcmp_oeq_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 0.000000e+00 +; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]]) +; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK: if: +; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret double [[RETVAL]] +; +entry: + %cmp = fcmp oeq double %y, 0.0 + br i1 %cmp, label %if, label %return + +if: + %div = fdiv double %x, %y + br label %return + +return: + %retval = phi double [ %div, %if ], [ %x, %entry ] + ret double %retval + +} + +define double @fcmp_une_zero(double %x, double %y) { +; CHECK-LABEL: @fcmp_une_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], -0.000000e+00 +; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]]) +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK: else: +; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret double [[RETVAL]] +; +entry: + %cmp = fcmp une double %y, -0.0 + br i1 %cmp, label %return, label %else + +else: + %div = fdiv double %x, %y + br label %return + +return: + %retval = phi double [ %div, %else ], [ %x, %entry ] + ret double %retval + +} + + +define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2) { +; CHECK-LABEL: @fcmp_oeq_maybe_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], [[Z]] +; CHECK: [[Z_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Z]]) +; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK: if: +; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]] +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret double [[RETVAL]] +; +entry: + %z = fadd double %z1, %z2 + %cmp = fcmp oeq double %y, %z + br i1 %cmp, label %if, label %return + +if: + %div = fdiv double %x, %z + br label %return + +return: + %retval = phi double [ %div, %if ], [ %x, %entry ] + ret double %retval + +} + +define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2) { +; CHECK-LABEL: @fcmp_une_maybe_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], [[Z]] +; CHECK: [[Z_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Z]]) +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK: else: +; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]] +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: ret 
double [[RETVAL]] +; +entry: + %z = fadd double %z1, %z2 + %cmp = fcmp une double %y, %z + br i1 %cmp, label %return, label %else + +else: + %div = fdiv double %x, %z + br label %return + +return: + %retval = phi double [ %div, %else ], [ %x, %entry ] + ret double %retval + +} diff --git a/test/Transforms/Util/PredicateInfo/testandor.ll b/test/Transforms/Util/PredicateInfo/testandor.ll new file mode 100644 index 000000000000..5942ed155318 --- /dev/null +++ b/test/Transforms/Util/PredicateInfo/testandor.ll @@ -0,0 +1,211 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -print-predicateinfo < %s 2>&1 | FileCheck %s + +declare void @foo(i1) +declare void @bar(i32) +declare void @llvm.assume(i1) + +define void @testor(i32 %x, i32 %y) { +; CHECK-LABEL: @testor( +; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] +; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) +; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) +; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]] +; CHECK: oneof: +; CHECK-NEXT: call void @foo(i1 [[XZ]]) +; CHECK-NEXT: call void @foo(i1 [[YZ]]) +; CHECK-NEXT: call void @bar(i32 [[X]]) +; CHECK-NEXT: call void @bar(i32 [[Y]]) +; CHECK-NEXT: ret void +; CHECK: neither: +; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) +; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) +; CHECK-NEXT: call void @bar(i32 [[X_0]]) +; CHECK-NEXT: call void @bar(i32 [[Y_0]]) +; CHECK-NEXT: call void @foo(i1 [[Z_0]]) +; CHECK-NEXT: ret void +; + %xz = icmp eq i32 %x, 0 + %yz = icmp eq i32 %y, 0 + %z = or i1 %xz, %yz + br i1 %z, label %oneof, label %neither +oneof: +;; Should not insert on the true edge for or + call void @foo(i1 %xz) + call void @foo(i1 %yz) + call void @bar(i32 %x) + call void @bar(i32 %y) + ret void +neither: + call void @foo(i1 %xz) + call void @foo(i1 %yz) + call void @bar(i32 %x) + call void @bar(i32 %y) + call void @foo(i1 %z) + ret void +} +define void @testand(i32 %x, i32 %y) { +; CHECK-LABEL: @testand( +; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] +; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) +; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) +; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK: both: +; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) +; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) +; CHECK-NEXT: call void @bar(i32 [[X_0]]) +; CHECK-NEXT: call void @bar(i32 [[Y_0]]) +; CHECK-NEXT: ret void +; CHECK: nope: +; CHECK-NEXT: call void @foo(i1 [[XZ]]) +; CHECK-NEXT: call void @foo(i1 [[YZ]]) +; CHECK-NEXT: call void @bar(i32 [[X]]) +; CHECK-NEXT: call void @bar(i32 [[Y]]) +; CHECK-NEXT: call void @foo(i1 [[Z_0]]) +; CHECK-NEXT: ret void +; + %xz = icmp eq i32 %x, 0 + %yz = icmp eq i32 %y, 0 + %z = and i1 %xz, %yz + br i1 %z, label %both, label %nope +both: + call void @foo(i1 %xz) + call void @foo(i1 %yz) + call void @bar(i32 %x) + call void @bar(i32 %y) + ret void +nope: +;; Should not insert 
on the false edge for and + call void @foo(i1 %xz) + call void @foo(i1 %yz) + call void @bar(i32 %x) + call void @bar(i32 %y) + call void @foo(i1 %z) + ret void +} +define void @testandsame(i32 %x, i32 %y) { +; CHECK-LABEL: @testandsame( +; CHECK-NEXT: [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0 +; CHECK-NEXT: [[XLT:%.*]] = icmp slt i32 [[X]], 100 +; CHECK-NEXT: [[Z:%.*]] = and i1 [[XGT]], [[XLT]] +; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]]) +; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[X_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X_0]]) +; CHECK: [[XLT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XLT]]) +; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) +; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK: both: +; CHECK-NEXT: call void @foo(i1 [[XGT_0]]) +; CHECK-NEXT: call void @foo(i1 [[XLT_0]]) +; CHECK-NEXT: call void @bar(i32 [[X_0_1]]) +; CHECK-NEXT: ret void +; CHECK: nope: +; CHECK-NEXT: call void @foo(i1 [[XGT]]) +; CHECK-NEXT: call void @foo(i1 [[XLT]]) +; CHECK-NEXT: call void @foo(i1 [[Z_0]]) +; CHECK-NEXT: ret void +; + %xgt = icmp sgt i32 %x, 0 + %xlt = icmp slt i32 %x, 100 + %z = and i1 %xgt, %xlt + br i1 %z, label %both, label %nope +both: + call void @foo(i1 %xgt) + call void @foo(i1 %xlt) + call void @bar(i32 %x) + ret void +nope: + call void @foo(i1 %xgt) + call void @foo(i1 %xlt) + call void @foo(i1 %z) + ret void +} + +define void @testandassume(i32 %x, i32 %y) { +; CHECK-LABEL: @testandassume( +; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] +; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[TMP2:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) +; CHECK: [[TMP4:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) +; CHECK-NEXT: call void @llvm.assume(i1 [[TMP5]]) +; CHECK: [[DOT0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP1]]) +; CHECK: [[DOT01:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP2]]) +; CHECK: [[DOT02:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP3]]) +; CHECK: [[DOT03:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP4]]) +; CHECK: [[DOT04:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP5]]) +; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK: both: +; CHECK-NEXT: call void @foo(i1 [[DOT0]]) +; CHECK-NEXT: call void @foo(i1 [[DOT02]]) +; CHECK-NEXT: call void @bar(i32 [[DOT01]]) +; CHECK-NEXT: call void @bar(i32 [[DOT03]]) +; CHECK-NEXT: ret void +; CHECK: nope: +; CHECK-NEXT: call void @foo(i1 [[DOT04]]) +; CHECK-NEXT: ret void +; + %xz = icmp eq i32 %x, 0 + %yz = icmp eq i32 %y, 0 + %z = and i1 %xz, %yz + call void @llvm.assume(i1 %z) + br i1 %z, label %both, label %nope +both: + call void @foo(i1 %xz) + call void @foo(i1 %yz) + call void @bar(i32 %x) + call void @bar(i32 %y) + ret void +nope: + call void @foo(i1 %z) + ret void +} + +;; Unlike and/or for branches, assume is *always* true, so we only match and for it +define void @testorassume(i32 %x, i32 %y) { +; +; CHECK-LABEL: @testorassume( +; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] +; CHECK-NEXT: call void @llvm.assume(i1 [[Z]]) +; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) +; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK: 
both: +; CHECK-NEXT: call void @foo(i1 [[XZ]]) +; CHECK-NEXT: call void @foo(i1 [[YZ]]) +; CHECK-NEXT: call void @bar(i32 [[X]]) +; CHECK-NEXT: call void @bar(i32 [[Y]]) +; CHECK-NEXT: ret void +; CHECK: nope: +; CHECK-NEXT: call void @foo(i1 [[Z_0]]) +; CHECK-NEXT: ret void +; + %xz = icmp eq i32 %x, 0 + %yz = icmp eq i32 %y, 0 + %z = or i1 %xz, %yz + call void @llvm.assume(i1 %z) + br i1 %z, label %both, label %nope +both: + call void @foo(i1 %xz) + call void @foo(i1 %yz) + call void @bar(i32 %x) + call void @bar(i32 %y) + ret void +nope: + call void @foo(i1 %z) + ret void +} diff --git a/test/Transforms/Util/clone-dicompileunit.ll b/test/Transforms/Util/clone-dicompileunit.ll new file mode 100644 index 000000000000..3f7b5981752d --- /dev/null +++ b/test/Transforms/Util/clone-dicompileunit.ll @@ -0,0 +1,66 @@ +; RUN: opt -run-twice -verify -disable-debug-info-type-map -S -o - %s | FileCheck %s + +; Generated using: +; $ cat p.cpp +; void sink(void *); +; class A { +; public: +; template <typename> void m_fn2() { static int a; } +; virtual void m_fn1(); +; }; +; void foo() { +; class B : public A { +; public: +; B() { m_fn2<B>(); } +; }; +; sink(new B); +; } +; $ clang++ -target x86_64-unknown-linux -fvisibility=hidden -O2 -g2 -flto -S p.cpp -o p.ll +; # then manually removed function/gv definitions + +; Test that when the module is cloned it does not contain a reference to +; the original DICompileUnit as a result of a collision between the cloned +; DISubprogram for m_fn2<B> (which refers to the non-ODR entity B via +; template parameters) and the original DISubprogram. + +; CHECK: DICompileUnit +; CHECK-NOT: DICompileUnit + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux" + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!28, !29} +!llvm.ident = !{!30} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !3) +!1 = !DIFile(filename: "p.cpp", directory: "/usr/local/google/home/pcc/b682773-2-repro/small2") +!2 = !{} +!3 = !{!4} +!4 = !DIGlobalVariableExpression(var: !5) +!5 = distinct !DIGlobalVariable(name: "a", scope: !6, file: !1, line: 5, type: !27, isLocal: true, isDefinition: true) +!6 = distinct !DISubprogram(name: "m_fn2<B>", linkageName: "_ZN1A5m_fn2IZ3foovE1BEEvv", scope: !7, file: !1, line: 5, type: !8, isLocal: true, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !11, declaration: !23, variables: !24) +!7 = !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !1, line: 3, flags: DIFlagFwdDecl, identifier: "_ZTS1A") +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!11 = !{!12} +!12 = !DITemplateTypeParameter(type: !13) +!13 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B", scope: !14, file: !1, line: 10, size: 64, elements: !17, vtableHolder: !7) +!14 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 9, type: !15, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2) +!15 = !DISubroutineType(types: !16) +!16 = !{null} +!17 = !{!18, !19} +!18 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !13, baseType: !7, flags: DIFlagPublic) +!19 = !DISubprogram(name: "B", scope: !13, file: !1, line: 12, 
type: !20, isLocal: false, isDefinition: false, scopeLine: 12, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true) +!20 = !DISubroutineType(types: !21) +!21 = !{null, !22} +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!23 = !DISubprogram(name: "m_fn2<B>", linkageName: "_ZN1A5m_fn2IZ3foovE1BEEvv", scope: !7, file: !1, line: 5, type: !8, isLocal: false, isDefinition: false, scopeLine: 5, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true, templateParams: !11) +!24 = !{!25} +!25 = !DILocalVariable(name: "this", arg: 1, scope: !6, type: !26, flags: DIFlagArtificial | DIFlagObjectPointer) +!26 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64) +!27 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!28 = !{i32 2, !"Dwarf Version", i32 4} +!29 = !{i32 2, !"Debug Info Version", i32 3} +!30 = !{!"clang version 5.0.0 "} diff --git a/test/Transforms/Util/simplify-dbg-declare-load.ll b/test/Transforms/Util/simplify-dbg-declare-load.ll index 21d305450860..4ea88fa81e05 100644 --- a/test/Transforms/Util/simplify-dbg-declare-load.ll +++ b/test/Transforms/Util/simplify-dbg-declare-load.ll @@ -19,7 +19,7 @@ fail: ; preds = %top unreachable idxend: ; preds = %top -; CHECK-NOT call void @llvm.dbg.value(metadata %foo* %cp, +; CHECK-NOT: call void @llvm.dbg.value(metadata %foo* %cp, %0 = load volatile %foo, %foo* %cp, align 8 ; CHECK: call void @llvm.dbg.value(metadata %foo %0, store volatile %foo %0, %foo* undef, align 8 diff --git a/test/Transforms/Util/strip-nonlinetable-debuginfo-loops.ll b/test/Transforms/Util/strip-nonlinetable-debuginfo-loops.ll new file mode 100644 index 000000000000..5f88e31da9fc --- /dev/null +++ b/test/Transforms/Util/strip-nonlinetable-debuginfo-loops.ll @@ -0,0 +1,71 @@ +; RUN: opt -S -strip-nonlinetable-debuginfo %s -o %t +; RUN: cat %t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=NEGATIVE +; void f(volatile int *i) { +; while (--*i) {} +; } +source_filename = "/tmp/loop.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +define void @f(i32* %i) local_unnamed_addr #0 !dbg !7 { +entry: + tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14, metadata !15), !dbg !16 + br label %while.cond, !dbg !17 + +while.cond: ; preds = %while.cond, %entry + %0 = load volatile i32, i32* %i, align 4, !dbg !18, !tbaa !19 + %dec = add nsw i32 %0, -1, !dbg !18 + store volatile i32 %dec, i32* %i, align 4, !dbg !18, !tbaa !19 + %tobool = icmp eq i32 %dec, 0, !dbg !17 + ; CHECK: !llvm.loop ![[LOOP:[0-9]+]] + br i1 %tobool, label %while.end, label %while.cond, !dbg !17, !llvm.loop !23 + +while.end: ; preds = %while.cond + ret void, !dbg !25 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +attributes #0 = { nounwind ssp uwtable } +attributes #1 = { nounwind readnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +; CHECK: ![[CU:.*]] = distinct !DICompileUnit(language: DW_LANG_C99, +; CHECK-SAME: emissionKind: LineTablesOnly +; NEGATIVE-NOT: !DICompileUnit({{.*}} emissionKind: FullDebug +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (trunk 298880) (llvm/trunk 298875)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "/tmp/loop.c", directory: "/") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = 
!{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"PIC Level", i32 2} +!6 = !{!"clang version 5.0.0 (trunk 298880) (llvm/trunk 298875)"} +; CHECK: ![[F:[0-9]]] = distinct !DISubprogram(name: "f", scope: !1 +!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !13) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !12) +; NEGATIVE-NOT: !DIBasicType(name: "int", +!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!13 = !{!14} +!14 = !DILocalVariable(name: "i", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!15 = !DIExpression() +!16 = !DILocation(line: 1, column: 22, scope: !7) +; CHECK: ![[BEGIN:[0-9]+]] = !DILocation(line: 2, column: 3, scope: ![[F]]) +!17 = !DILocation(line: 2, column: 3, scope: !7) +!18 = !DILocation(line: 2, column: 10, scope: !7) +!19 = !{!20, !20, i64 0} +!20 = !{!"int", !21, i64 0} +!21 = !{!"omnipotent char", !22, i64 0} +!22 = !{!"Simple C/C++ TBAA"} +; CHECK: ![[LOOP]] = distinct !{![[LOOP]], ![[BEGIN]], ![[END:[0-9]+]]} +!23 = distinct !{!23, !17, !24} +; CHECK: ![[END]] = !DILocation(line: 3, column: 3, scope: ![[F]]) +!24 = !DILocation(line: 3, column: 3, scope: !7) +!25 = !DILocation(line: 4, column: 1, scope: !7) diff --git a/test/Transforms/WholeProgramDevirt/Inputs/export.yaml b/test/Transforms/WholeProgramDevirt/Inputs/export.yaml new file mode 100644 index 000000000000..0f6f59de7522 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/Inputs/export.yaml @@ -0,0 +1,20 @@ +--- +GlobalValueMap: + 42: + - TypeTestAssumeVCalls: + - GUID: 14276520915468743435 # typeid1 + Offset: 0 + TypeCheckedLoadVCalls: + - GUID: 15427464259790519041 # typeid2 + Offset: 0 + TypeTestAssumeConstVCalls: + - VFunc: + GUID: 3515965990081467659 # typeid3 + Offset: 0 + Args: [12, 24] + TypeCheckedLoadConstVCalls: + - VFunc: + GUID: 17525413373118030901 # typeid4 + Offset: 0 + Args: [24, 12] +... diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml new file mode 100644 index 000000000000..1cb3ad3f134c --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml @@ -0,0 +1,41 @@ +--- +GlobalValueMap: + 42: + - TypeTestAssumeVCalls: + - GUID: 123 + Offset: 0 + - GUID: 456 + Offset: 4 + TypeCheckedLoadVCalls: + - GUID: 789 + Offset: 8 + - GUID: 1234 + Offset: 16 + TypeTestAssumeConstVCalls: + - VFunc: + GUID: 123 + Offset: 4 + Args: [12, 24] + TypeCheckedLoadConstVCalls: + - VFunc: + GUID: 456 + Offset: 8 + Args: [24, 12] +TypeIdMap: + typeid1: + WPDRes: + 0: + Kind: Indir + 4: + Kind: Indir + ResByArg: + "": + Kind: UniformRetVal + Info: 12 + 12: + Kind: UniformRetVal + Info: 24 + "12,24": + Kind: UniformRetVal + Info: 48 +... diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-single-impl.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-single-impl.yaml new file mode 100644 index 000000000000..26764eb3b29c --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/Inputs/import-single-impl.yaml @@ -0,0 +1,13 @@ +--- +TypeIdMap: + typeid1: + WPDRes: + 0: + Kind: SingleImpl + SingleImplName: singleimpl1 + typeid2: + WPDRes: + 8: + Kind: SingleImpl + SingleImplName: singleimpl2 +... 
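; Note on the import-single-impl.yaml summary above: a SingleImpl resolution
; tells the import phase that a virtual call site checked against the given
; type id can be devirtualized into a direct call to SingleImplName. A minimal
; sketch of the intended rewrite for "typeid1" at offset 0 (the call-site
; shape and value names here are illustrative assumptions, not test content):
;
;   %fptr = bitcast i8* %slot0 to void (i8*)*   ; vtable slot at offset 0
;   call void %fptr(i8* %obj)                   ; before: indirect virtual call
;   call void @singleimpl1(i8* %obj)            ; after: direct call to the single impl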
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-uniform-ret-val.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-uniform-ret-val.yaml new file mode 100644 index 000000000000..f1daae63b678 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/Inputs/import-uniform-ret-val.yaml @@ -0,0 +1,19 @@ +--- +TypeIdMap: + typeid1: + WPDRes: + 0: + Kind: Indir + ResByArg: + 1: + Kind: UniformRetVal + Info: 42 + typeid2: + WPDRes: + 8: + Kind: Indir + ResByArg: + 1: + Kind: UniformRetVal + Info: 42 +... diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val0.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val0.yaml new file mode 100644 index 000000000000..597b17877767 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val0.yaml @@ -0,0 +1,11 @@ +--- +TypeIdMap: + typeid2: + WPDRes: + 8: + Kind: Indir + ResByArg: + 3: + Kind: UniqueRetVal + Info: 0 +... diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val1.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val1.yaml new file mode 100644 index 000000000000..737ef1173c3c --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val1.yaml @@ -0,0 +1,11 @@ +--- +TypeIdMap: + typeid2: + WPDRes: + 8: + Kind: Indir + ResByArg: + 3: + Kind: UniqueRetVal + Info: 1 +... diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-vcp.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-vcp.yaml new file mode 100644 index 000000000000..4fbee126d0ea --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/Inputs/import-vcp.yaml @@ -0,0 +1,19 @@ +--- +TypeIdMap: + typeid1: + WPDRes: + 0: + Kind: Indir + ResByArg: + 1: + Kind: VirtualConstProp + Info: 0 + typeid2: + WPDRes: + 8: + Kind: Indir + ResByArg: + 3: + Kind: VirtualConstProp + Info: 0 +... diff --git a/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll b/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll index 4885be777566..e5d0e74b22e2 100644 --- a/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll +++ b/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll @@ -3,8 +3,8 @@ target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" -@vt1 = global [2 x i8*] [i8* zeroinitializer, i8* bitcast (void (i8*)* @vf to i8*)], !type !0 -@vt2 = global i8* bitcast (void (i8*)* @vf to i8*), !type !1 +@vt1 = constant [2 x i8*] [i8* zeroinitializer, i8* bitcast (void (i8*)* @vf to i8*)], !type !0 +@vt2 = constant i8* bitcast (void (i8*)* @vf to i8*), !type !1 define void @vf(i8* %this) { ret void diff --git a/test/Transforms/WholeProgramDevirt/export-nothing.ll b/test/Transforms/WholeProgramDevirt/export-nothing.ll new file mode 100644 index 000000000000..e0814efbf9c0 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/export-nothing.ll @@ -0,0 +1,7 @@ +; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-write-summary=%t -o /dev/null %s +; RUN: FileCheck %s < %t + +; CHECK: --- +; CHECK-NEXT: GlobalValueMap: +; CHECK-NEXT: TypeIdMap: +; CHECK-NEXT: ... 
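; The export-nothing test above checks the degenerate case: with no !type
; annotated vtables to summarize, GlobalValueMap and TypeIdMap are written out
; empty. For contrast, a minimal sketch of input that does get summarized,
; mirroring the next test (illustrative, not additional test content):
;
;   @vt1 = constant void (i8*)* @vf1, !type !0
;   !0 = !{i32 0, !"typeid1"}
;
; which, with @vf1 as the only visible implementation, is expected to export a
; TypeIdMap entry for typeid1 whose WPDRes at offset 0 has Kind: SingleImpl and
; SingleImplName: vf1.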
diff --git a/test/Transforms/WholeProgramDevirt/export-single-impl.ll b/test/Transforms/WholeProgramDevirt/export-single-impl.ll new file mode 100644 index 000000000000..f4f3fd054c46 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/export-single-impl.ll @@ -0,0 +1,78 @@ +; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +; SUMMARY: TypeIdMap: +; SUMMARY-NEXT: typeid1: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: SingleImpl +; SUMMARY-NEXT: SingleImplName: vf1 +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: typeid2: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: SingleImpl +; SUMMARY-NEXT: SingleImplName: vf2 +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: typeid3: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: SingleImpl +; SUMMARY-NEXT: SingleImplName: vf3 +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: typeid4: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: SingleImpl +; SUMMARY-NEXT: SingleImplName: 'vf4$merged' +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: ... + +; CHECK: @vt1 = constant void (i8*)* @vf1 +@vt1 = constant void (i8*)* @vf1, !type !0 + +; CHECK: @vt2 = constant void (i8*)* @vf2 +@vt2 = constant void (i8*)* @vf2, !type !1 + +@vt3 = constant void (i8*)* @vf3, !type !2 + +; CHECK: @vt4 = constant void (i8*)* @"vf4$merged" +@vt4 = constant void (i8*)* @vf4, !type !3 + +@vt5 = constant void (i8*)* @vf5, !type !4 + +; CHECK: declare void @vf1(i8*) +declare void @vf1(i8*) + +; CHECK: define void @vf2(i8*) +define void @vf2(i8*) { + ret void +} + +declare void @vf3(i8*) + +; CHECK: define hidden void @"vf4$merged" +define internal void @vf4(i8*) { + ret void +} + +declare void @vf5(i8*) + +!0 = !{i32 0, !"typeid1"} +!1 = !{i32 0, !"typeid2"} +!2 = !{i32 0, !"typeid3"} +!3 = !{i32 0, !"typeid4"} +!4 = !{i32 0, !5} +!5 = distinct !{} diff --git a/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll b/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll new file mode 100644 index 000000000000..1d7030c41fd0 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll @@ -0,0 +1,36 @@ +; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +; SUMMARY: - TypeTests: +; SUMMARY-NEXT: TypeTestAssumeVCalls: + +; SUMMARY: TypeIdMap: +; SUMMARY-NEXT: typeid4: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: Indir +; SUMMARY-NEXT: SingleImplName: '' +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: 24,12: +; SUMMARY-NEXT: Kind: UniformRetVal +; SUMMARY-NEXT: Info: 36 + +; CHECK: @vt4a = constant i32 (i8*, i32, i32)* @vf4a +@vt4a = constant i32 (i8*, i32, i32)* @vf4a, !type !0 + +; CHECK: @vt4b = constant i32 (i8*, i32, i32)* @vf4b +@vt4b = constant i32 (i8*, i32, 
i32)* @vf4b, !type !0 + +define i32 @vf4a(i8*, i32 %x, i32 %y) { + %z = add i32 %x, %y + ret i32 %z +} + +define i32 @vf4b(i8*, i32 %x, i32 %y) { + ret i32 36 +} + +!0 = !{i32 0, !"typeid4"} diff --git a/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll b/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll new file mode 100644 index 000000000000..174a573b5b0d --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll @@ -0,0 +1,79 @@ +; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +; SUMMARY: - TypeTests: +; SUMMARY-NEXT: TypeTestAssumeVCalls: + +; SUMMARY: TypeIdMap: +; SUMMARY-NEXT: typeid3: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: Indir +; SUMMARY-NEXT: SingleImplName: '' +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: 12,24: +; SUMMARY-NEXT: Kind: UniqueRetVal +; SUMMARY-NEXT: Info: 0 +; SUMMARY-NEXT: typeid4: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: Indir +; SUMMARY-NEXT: SingleImplName: '' +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: 24,12: +; SUMMARY-NEXT: Kind: UniqueRetVal +; SUMMARY-NEXT: Info: 1 + +; CHECK: @vt3a = constant i1 (i8*, i32, i32)* @vf3a +@vt3a = constant i1 (i8*, i32, i32)* @vf3a, !type !0 + +; CHECK: @vt3b = constant i1 (i8*, i32, i32)* @vf3b +@vt3b = constant i1 (i8*, i32, i32)* @vf3b, !type !0 + +; CHECK: @vt3c = constant i1 (i8*, i32, i32)* @vf3c +@vt3c = constant i1 (i8*, i32, i32)* @vf3c, !type !0 + +; CHECK: @vt4a = constant i1 (i8*, i32, i32)* @vf4a +@vt4a = constant i1 (i8*, i32, i32)* @vf4a, !type !1 + +; CHECK: @vt4b = constant i1 (i8*, i32, i32)* @vf4b +@vt4b = constant i1 (i8*, i32, i32)* @vf4b, !type !1 + +; CHECK: @vt4c = constant i1 (i8*, i32, i32)* @vf4c +@vt4c = constant i1 (i8*, i32, i32)* @vf4c, !type !1 + +; CHECK: @__typeid_typeid3_0_12_24_unique_member = hidden alias i8, bitcast (i1 (i8*, i32, i32)** @vt3b to i8*) +; CHECK: @__typeid_typeid4_0_24_12_unique_member = hidden alias i8, bitcast (i1 (i8*, i32, i32)** @vt4b to i8*) + +define i1 @vf3a(i8*, i32, i32) { + ret i1 true +} + +define i1 @vf3b(i8*, i32, i32) { + ret i1 false +} + +define i1 @vf3c(i8*, i32, i32) { + ret i1 true +} + +define i1 @vf4a(i8*, i32, i32) { + ret i1 false +} + +define i1 @vf4b(i8*, i32, i32) { + ret i1 true +} + +define i1 @vf4c(i8*, i32, i32) { + ret i1 false +} + +!0 = !{i32 0, !"typeid3"} +!1 = !{i32 0, !"typeid4"} diff --git a/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll b/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll new file mode 100644 index 000000000000..0785ade28570 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll @@ -0,0 +1,28 @@ +; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -o /dev/null %s +; RUN: FileCheck %s < %t + +; CHECK: - TypeTests: [ 15427464259790519041, 17525413373118030901 ] +; CHECK-NEXT: TypeTestAssumeVCalls: + +@vt1a = constant void (i8*)* @vf1a, !type !0 +@vt1b = constant void (i8*)* @vf1b, !type !0 +@vt2a = constant void (i8*)* @vf2a, !type !1 +@vt2b = constant void (i8*)* @vf2b, !type !1 +@vt3a 
= constant void (i8*)* @vf3a, !type !2 +@vt3b = constant void (i8*)* @vf3b, !type !2 +@vt4a = constant void (i8*)* @vf4a, !type !3 +@vt4b = constant void (i8*)* @vf4b, !type !3 + +declare void @vf1a(i8*) +declare void @vf1b(i8*) +declare void @vf2a(i8*) +declare void @vf2b(i8*) +declare void @vf3a(i8*) +declare void @vf3b(i8*) +declare void @vf4a(i8*) +declare void @vf4b(i8*) + +!0 = !{i32 0, !"typeid1"} +!1 = !{i32 0, !"typeid2"} +!2 = !{i32 0, !"typeid3"} +!3 = !{i32 0, !"typeid4"} diff --git a/test/Transforms/WholeProgramDevirt/export-vcp.ll b/test/Transforms/WholeProgramDevirt/export-vcp.ll new file mode 100644 index 000000000000..8e6e69b9bd43 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/export-vcp.ll @@ -0,0 +1,83 @@ +; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +target datalayout = "e-p:64:64" +target triple = "x86_64-unknown-linux-gnu" + +; SUMMARY: TypeIdMap: +; SUMMARY-NEXT: typeid3: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: Indir +; SUMMARY-NEXT: SingleImplName: '' +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: 12,24: +; SUMMARY-NEXT: Kind: VirtualConstProp +; SUMMARY-NEXT: Info: 0 +; SUMMARY-NEXT: typeid4: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: Indir +; SUMMARY-NEXT: SingleImplName: '' +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: 24,12: +; SUMMARY-NEXT: Kind: VirtualConstProp +; SUMMARY-NEXT: Info: 0 + +; CHECK: [[CVT3A:.*]] = private constant { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] zeroinitializer, i1 (i8*, i32, i32)* @vf0i1, [0 x i8] zeroinitializer }, !type !0 +@vt3a = constant i1 (i8*, i32, i32)* @vf0i1, !type !0 + +; CHECK: [[CVT3B:.*]] = private constant { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] c"\00\00\00\00\00\00\00\01", i1 (i8*, i32, i32)* @vf1i1, [0 x i8] zeroinitializer }, !type !0 +@vt3b = constant i1 (i8*, i32, i32)* @vf1i1, !type !0 + +; CHECK: [[CVT3C:.*]] = private constant { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] zeroinitializer, i1 (i8*, i32, i32)* @vf0i1, [0 x i8] zeroinitializer }, !type !0 +@vt3c = constant i1 (i8*, i32, i32)* @vf0i1, !type !0 + +; CHECK: [[CVT3D:.*]] = private constant { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] c"\00\00\00\00\00\00\00\01", i1 (i8*, i32, i32)* @vf1i1, [0 x i8] zeroinitializer }, !type !0 +@vt3d = constant i1 (i8*, i32, i32)* @vf1i1, !type !0 + +; CHECK: [[CVT4A:.*]] = private constant { [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] c"\00\00\00\00\01\00\00\00", i32 (i8*, i32, i32)* @vf1i32, [0 x i8] zeroinitializer }, !type !1 +@vt4a = constant i32 (i8*, i32, i32)* @vf1i32, !type !1 + +; CHECK: [[CVT4B:.*]] = private constant { [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] c"\00\00\00\00\02\00\00\00", i32 (i8*, i32, i32)* @vf2i32, [0 x i8] zeroinitializer }, !type !1 +@vt4b = constant i32 (i8*, i32, i32)* @vf2i32, !type !1 + +; CHECK: @__typeid_typeid3_0_12_24_byte = hidden alias i8, inttoptr (i32 -1 to i8*) +; CHECK: @__typeid_typeid3_0_12_24_bit = hidden alias i8, inttoptr (i8 1 to i8*) +; CHECK: @__typeid_typeid4_0_24_12_byte = hidden alias i8, inttoptr (i32 -4 to i8*) +; CHECK: @__typeid_typeid4_0_24_12_bit = hidden alias i8, 
inttoptr (i8 1 to i8*) + +; CHECK: @vt3a = alias i1 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }* [[CVT3A]], i32 0, i32 1) +; CHECK: @vt3b = alias i1 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }* [[CVT3B]], i32 0, i32 1) +; CHECK: @vt3c = alias i1 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }* [[CVT3C]], i32 0, i32 1) +; CHECK: @vt3d = alias i1 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }* [[CVT3D]], i32 0, i32 1) +; CHECK: @vt4a = alias i32 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] }* [[CVT4A]], i32 0, i32 1) +; CHECK: @vt4b = alias i32 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] }* [[CVT4B]], i32 0, i32 1) + +define i1 @vf0i1(i8* %this, i32, i32) readnone { + ret i1 0 +} + +define i1 @vf1i1(i8* %this, i32, i32) readnone { + ret i1 1 +} + +define i32 @vf1i32(i8* %this, i32, i32) readnone { + ret i32 1 +} + +define i32 @vf2i32(i8* %this, i32, i32) readnone { + ret i32 2 +} + +; CHECK: !0 = !{i32 8, !"typeid3"} +; CHECK: !1 = !{i32 8, !"typeid4"} + +!0 = !{i32 0, !"typeid3"} +!1 = !{i32 0, !"typeid4"} diff --git a/test/Transforms/WholeProgramDevirt/import-indir.ll b/test/Transforms/WholeProgramDevirt/import-indir.ll new file mode 100644 index 000000000000..1de9352eeb22 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/import-indir.ll @@ -0,0 +1,95 @@ +; Test that we correctly import an indir resolution for type identifier "typeid1". 
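+; An Indir resolution leaves the call itself indirect: as the CHECK lines in
+; @f2 below expect, the pass only lowers llvm.type.checked.load into a plain
+; vtable load plus an llvm.type.test.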
+; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-indir.yaml -wholeprogramdevirt-write-summary=%t < %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +; SUMMARY: GlobalValueMap: +; SUMMARY-NEXT: 42: +; SUMMARY-NEXT: - TypeTests: +; SUMMARY-NEXT: TypeTestAssumeVCalls: +; SUMMARY-NEXT: - GUID: 123 +; SUMMARY-NEXT: Offset: 0 +; SUMMARY-NEXT: - GUID: 456 +; SUMMARY-NEXT: Offset: 4 +; SUMMARY-NEXT: TypeCheckedLoadVCalls: +; SUMMARY-NEXT: - GUID: 789 +; SUMMARY-NEXT: Offset: 8 +; SUMMARY-NEXT: - GUID: 1234 +; SUMMARY-NEXT: Offset: 16 +; SUMMARY-NEXT: TypeTestAssumeConstVCalls: +; SUMMARY-NEXT: - VFunc: +; SUMMARY-NEXT: GUID: 123 +; SUMMARY-NEXT: Offset: 4 +; SUMMARY-NEXT: Args: [ 12, 24 ] +; SUMMARY-NEXT: TypeCheckedLoadConstVCalls: +; SUMMARY-NEXT: - VFunc: +; SUMMARY-NEXT: GUID: 456 +; SUMMARY-NEXT: Offset: 8 +; SUMMARY-NEXT: Args: [ 24, 12 ] +; SUMMARY-NEXT: TypeIdMap: +; SUMMARY-NEXT: typeid1: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: 0: +; SUMMARY-NEXT: Kind: Indir +; SUMMARY-NEXT: SingleImplName: '' +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: 4: +; SUMMARY-NEXT: Kind: Indir +; SUMMARY-NEXT: SingleImplName: '' +; SUMMARY-NEXT: ResByArg: +; SUMMARY-NEXT: : +; SUMMARY-NEXT: Kind: UniformRetVal +; SUMMARY-NEXT: Info: 12 +; SUMMARY-NEXT: 12: +; SUMMARY-NEXT: Kind: UniformRetVal +; SUMMARY-NEXT: Info: 24 +; SUMMARY-NEXT: 12,24: +; SUMMARY-NEXT: Kind: UniformRetVal +; SUMMARY-NEXT: Info: 48 + +target datalayout = "e-p:32:32" + +declare void @llvm.assume(i1) +declare void @llvm.trap() +declare {i8*, i1} @llvm.type.checked.load(i8*, i32, metadata) +declare i1 @llvm.type.test(i8*, metadata) + +; CHECK: define i1 @f1 +define i1 @f1(i8* %obj) { + %vtableptr = bitcast i8* %obj to [1 x i8*]** + %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr + %vtablei8 = bitcast [1 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid1") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0 + %fptr = load i8*, i8** %fptrptr + %fptr_casted = bitcast i8* %fptr to i1 (i8*, i32)* + ; CHECK: call i1 % + %result = call i1 %fptr_casted(i8* %obj, i32 5) + ret i1 %result +} + +; CHECK: define i1 @f2 +define i1 @f2(i8* %obj) { + %vtableptr = bitcast i8* %obj to [1 x i8*]** + %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr + %vtablei8 = bitcast [1 x i8*]* %vtable to i8* + %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 4, metadata !"typeid1") + %fptr = extractvalue {i8*, i1} %pair, 0 + %p = extractvalue {i8*, i1} %pair, 1 + ; CHECK: [[P:%.*]] = call i1 @llvm.type.test + ; CHECK: br i1 [[P]] + br i1 %p, label %cont, label %trap + +cont: + %fptr_casted = bitcast i8* %fptr to i1 (i8*, i32)* + ; CHECK: call i1 % + %result = call i1 %fptr_casted(i8* %obj, i32 undef) + ret i1 %result + +trap: + call void @llvm.trap() + unreachable +} diff --git a/test/Transforms/WholeProgramDevirt/import.ll b/test/Transforms/WholeProgramDevirt/import.ll new file mode 100644 index 000000000000..7f34b04ce119 --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/import.ll @@ -0,0 +1,108 @@ +; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-single-impl.yaml < %s | FileCheck --check-prefixes=CHECK,SINGLE-IMPL %s +; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import 
-wholeprogramdevirt-read-summary=%S/Inputs/import-uniform-ret-val.yaml < %s | FileCheck --check-prefixes=CHECK,UNIFORM-RET-VAL %s +; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-unique-ret-val0.yaml < %s | FileCheck --check-prefixes=CHECK,UNIQUE-RET-VAL0 %s +; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-unique-ret-val1.yaml < %s | FileCheck --check-prefixes=CHECK,UNIQUE-RET-VAL1 %s +; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-vcp.yaml < %s | FileCheck --check-prefixes=CHECK,VCP,VCP64 %s +; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-vcp.yaml -mtriple=i686-unknown-linux -data-layout=e-p:32:32 < %s | FileCheck --check-prefixes=CHECK,VCP,VCP32 %s + +target datalayout = "e-p:64:64" +target triple = "x86_64-unknown-linux-gnu" + +; VCP: @__typeid_typeid1_0_1_byte = external hidden global i8, !absolute_symbol !0 +; VCP: @__typeid_typeid1_0_1_bit = external hidden global i8, !absolute_symbol !1 +; VCP: @__typeid_typeid2_8_3_byte = external hidden global i8, !absolute_symbol !0 +; VCP: @__typeid_typeid2_8_3_bit = external hidden global i8, !absolute_symbol !1 + +; Test cases where the argument values are known and we can apply virtual +; constant propagation. + +; CHECK: define i32 @call1 +define i32 @call1(i8* %obj) { + %vtableptr = bitcast i8* %obj to [3 x i8*]** + %vtable = load [3 x i8*]*, [3 x i8*]** %vtableptr + %vtablei8 = bitcast [3 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid1") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 0 + %fptr = load i8*, i8** %fptrptr + %fptr_casted = bitcast i8* %fptr to i32 (i8*, i32)* + ; SINGLE-IMPL: call i32 bitcast (void ()* @singleimpl1 to i32 (i8*, i32)*) + %result = call i32 %fptr_casted(i8* %obj, i32 1) + ; UNIFORM-RET-VAL: ret i32 42 + ; VCP: [[VT1:%.*]] = bitcast {{.*}} to i8* + ; VCP: [[GEP1:%.*]] = getelementptr i8, i8* [[VT1]], i32 ptrtoint (i8* @__typeid_typeid1_0_1_byte to i32) + ; VCP: [[BC1:%.*]] = bitcast i8* [[GEP1]] to i32* + ; VCP: [[LOAD1:%.*]] = load i32, i32* [[BC1]] + ; VCP: ret i32 [[LOAD1]] + ret i32 %result +} + +; Test cases where the argument values are unknown, so we cannot apply virtual +; constant propagation. 
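+; The ResByArg maps are keyed on concrete argument values, so a call site
+; passing i32 undef matches no entry and the argument-dependent resolutions
+; (uniform/unique return value, VCP) do not fire; SingleImpl still applies,
+; and @call3 below, which passes i32 3, gets the full treatment.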
+ +; CHECK: define i1 @call2 +define i1 @call2(i8* %obj) { + %vtableptr = bitcast i8* %obj to [1 x i8*]** + %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr + %vtablei8 = bitcast [1 x i8*]* %vtable to i8* + %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 8, metadata !"typeid2") + %fptr = extractvalue {i8*, i1} %pair, 0 + %p = extractvalue {i8*, i1} %pair, 1 + ; SINGLE-IMPL: br i1 true, + br i1 %p, label %cont, label %trap + +cont: + %fptr_casted = bitcast i8* %fptr to i1 (i8*, i32)* + ; SINGLE-IMPL: call i1 bitcast (void ()* @singleimpl2 to i1 (i8*, i32)*) + ; UNIFORM-RET-VAL: call i1 % + ; UNIQUE-RET-VAL0: call i1 % + ; UNIQUE-RET-VAL1: call i1 % + %result = call i1 %fptr_casted(i8* %obj, i32 undef) + ret i1 %result + +trap: + call void @llvm.trap() + unreachable +} + +; CHECK: define i1 @call3 +define i1 @call3(i8* %obj) { + %vtableptr = bitcast i8* %obj to [1 x i8*]** + %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr + %vtablei8 = bitcast [1 x i8*]* %vtable to i8* + %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 8, metadata !"typeid2") + %fptr = extractvalue {i8*, i1} %pair, 0 + %p = extractvalue {i8*, i1} %pair, 1 + br i1 %p, label %cont, label %trap + +cont: + %fptr_casted = bitcast i8* %fptr to i1 (i8*, i32)* + %result = call i1 %fptr_casted(i8* %obj, i32 3) + ; UNIQUE-RET-VAL0: icmp ne i8* %vtablei8, @__typeid_typeid2_8_3_unique_member + ; UNIQUE-RET-VAL1: icmp eq i8* %vtablei8, @__typeid_typeid2_8_3_unique_member + ; VCP: [[VT2:%.*]] = bitcast {{.*}} to i8* + ; VCP: [[GEP2:%.*]] = getelementptr i8, i8* [[VT2]], i32 ptrtoint (i8* @__typeid_typeid2_8_3_byte to i32) + ; VCP: [[LOAD2:%.*]] = load i8, i8* [[GEP2]] + ; VCP: [[AND2:%.*]] = and i8 [[LOAD2]], ptrtoint (i8* @__typeid_typeid2_8_3_bit to i8) + ; VCP: [[ICMP2:%.*]] = icmp ne i8 [[AND2]], 0 + ; VCP: ret i1 [[ICMP2]] + ret i1 %result + +trap: + call void @llvm.trap() + unreachable +} + +; SINGLE-IMPL-DAG: declare void @singleimpl1() +; SINGLE-IMPL-DAG: declare void @singleimpl2() + +; VCP32: !0 = !{i32 -1, i32 -1} +; VCP64: !0 = !{i64 0, i64 4294967296} + +; VCP32: !1 = !{i32 0, i32 256} +; VCP64: !1 = !{i64 0, i64 256} + +declare void @llvm.assume(i1) +declare void @llvm.trap() +declare {i8*, i1} @llvm.type.checked.load(i8*, i32, metadata) +declare i1 @llvm.type.test(i8*, metadata) diff --git a/test/Transforms/WholeProgramDevirt/unique-retval.ll b/test/Transforms/WholeProgramDevirt/unique-retval.ll index 50b938c43e4a..e9ae176fe8ac 100644 --- a/test/Transforms/WholeProgramDevirt/unique-retval.ll +++ b/test/Transforms/WholeProgramDevirt/unique-retval.ll @@ -33,8 +33,8 @@ define i1 @call1(i8* %obj) { ret i1 %result } -; CHECK: define i1 @call2 -define i1 @call2(i8* %obj) { +; CHECK: define i32 @call2 +define i32 @call2(i8* %obj) { %vtableptr = bitcast i8* %obj to [1 x i8*]** %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr ; CHECK: [[VT2:%[^ ]*]] = bitcast [1 x i8*]* {{.*}} to i8* @@ -43,10 +43,13 @@ define i1 @call2(i8* %obj) { call void @llvm.assume(i1 %p) %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0 %fptr = load i8*, i8** %fptrptr - %fptr_casted = bitcast i8* %fptr to i1 (i8*)* - ; CHECK: [[RES1:%[^ ]*]] = icmp ne i8* [[VT1]], bitcast ([1 x i8*]* @vt2 to i8*) - %result = call i1 %fptr_casted(i8* %obj) - ret i1 %result + ; Intentional type mismatch to test zero extend. 
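+ ; The vtable slot holds an i1-returning function while the call site uses
+ ; i32, so the pass must zero-extend the devirtualized i1 result to the
+ ; caller's return type, as the ZEXT2 check below verifies.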
+ %fptr_casted = bitcast i8* %fptr to i32 (i8*)* + ; CHECK: [[RES2:%[^ ]*]] = icmp ne i8* [[VT2]], bitcast ([1 x i8*]* @vt2 to i8*) + %result = call i32 %fptr_casted(i8* %obj) + ; CHECK: [[ZEXT2:%[^ ]*]] = zext i1 [[RES2]] to i32 + ; CHECK: ret i32 [[ZEXT2]] + ret i32 %result } declare i1 @llvm.type.test(i8*, metadata) diff --git a/test/Transforms/WholeProgramDevirt/vcp-accesses-memory.ll b/test/Transforms/WholeProgramDevirt/vcp-accesses-memory.ll index b5d51f2d4637..ca76383c4943 100644 --- a/test/Transforms/WholeProgramDevirt/vcp-accesses-memory.ll +++ b/test/Transforms/WholeProgramDevirt/vcp-accesses-memory.ll @@ -1,21 +1,37 @@ ; RUN: opt -S -wholeprogramdevirt %s | FileCheck %s +; RUN: opt -S -passes=wholeprogramdevirt %s | FileCheck %s target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" -@vt1 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0 -@vt2 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0 +@vt1 = constant [2 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1a to i8*), i8* bitcast (i32 (i8*, i32)* @vf1b to i8*)], !type !0 +@vt2 = constant [2 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2a to i8*), i8* bitcast (i32 (i8*, i32)* @vf2b to i8*)], !type !0 -define i32 @vf1(i8* %this, i32 %arg) { +@sink = external global i32 + +define i32 @vf1a(i8* %this, i32 %arg) { + store i32 %arg, i32* @sink + ret i32 %arg +} + +define i32 @vf2a(i8* %this, i32 %arg) { + store i32 %arg, i32* @sink + ret i32 %arg +} + +define i32 @vf1b(i8* %this, i32 %arg) { ret i32 %arg } -define i32 @vf2(i8* %this, i32 %arg) { +define i32 @vf2b(i8* %this, i32 %arg) { ret i32 %arg } -; CHECK: define i32 @call -define i32 @call(i8* %obj) { +; Test that we don't apply VCP if the virtual function body accesses memory, +; even if the function returns a constant. + +; CHECK: define i32 @call1 +define i32 @call1(i8* %obj) { %vtableptr = bitcast i8* %obj to [1 x i8*]** %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr %vtablei8 = bitcast [1 x i8*]* %vtable to i8* @@ -29,6 +45,24 @@ define i32 @call(i8* %obj) { ret i32 %result } +; Test that we can apply VCP regardless of the function attributes by analyzing +; the function body itself.
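+; @vf1b and @vf2b carry no readnone attribute, but their bodies neither read
+; nor write memory, so scanning the bodies still proves that a call with
+; argument 1 folds to the constant 1, as the "ret i32 1" check in @call2
+; below expects.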
+ +; CHECK: define i32 @call2 +define i32 @call2(i8* %obj) { + %vtableptr = bitcast i8* %obj to [1 x i8*]** + %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr + %vtablei8 = bitcast [1 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 1 + %fptr = load i8*, i8** %fptrptr + %fptr_casted = bitcast i8* %fptr to i32 (i8*, i32)* + %result = call i32 %fptr_casted(i8* %obj, i32 1) + ; CHECK: ret i32 1 + ret i32 %result +} + declare i1 @llvm.type.test(i8*, metadata) declare void @llvm.assume(i1) diff --git a/test/Transforms/WholeProgramDevirt/vcp-decl.ll b/test/Transforms/WholeProgramDevirt/vcp-decl.ll new file mode 100644 index 000000000000..1c4e2fbe97aa --- /dev/null +++ b/test/Transforms/WholeProgramDevirt/vcp-decl.ll @@ -0,0 +1,32 @@ +; RUN: opt -S -wholeprogramdevirt %s | FileCheck %s + +target datalayout = "e-p:64:64" +target triple = "x86_64-unknown-linux-gnu" + +@vt1 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0 +@vt2 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0 + +declare i32 @vf1(i8* %this, i32 %arg) readnone + +define i32 @vf2(i8* %this, i32 %arg) readnone { + ret i32 %arg +} + +; CHECK: define i32 @fn +define i32 @fn(i8* %obj) { + %vtableptr = bitcast i8* %obj to [1 x i8*]** + %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr + %vtablei8 = bitcast [1 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0 + %fptr = load i8*, i8** %fptrptr + %fptr_casted = bitcast i8* %fptr to i32 (i8*, i32)* + ; CHECK: call i32 % + %result = call i32 %fptr_casted(i8* %obj, i32 1) + ret i32 %result +} +declare i1 @llvm.type.test(i8*, metadata) +declare void @llvm.assume(i1) + +!0 = !{i32 0, !"typeid"} diff --git a/test/Transforms/WholeProgramDevirt/vcp-no-this.ll b/test/Transforms/WholeProgramDevirt/vcp-no-this.ll index c564665471cf..ce76c8e6797e 100644 --- a/test/Transforms/WholeProgramDevirt/vcp-no-this.ll +++ b/test/Transforms/WholeProgramDevirt/vcp-no-this.ll @@ -3,8 +3,8 @@ target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" -@vt1 = global [1 x i8*] [i8* bitcast (i32 ()* @vf1 to i8*)], !type !0 -@vt2 = global [1 x i8*] [i8* bitcast (i32 ()* @vf2 to i8*)], !type !0 +@vt1 = constant [1 x i8*] [i8* bitcast (i32 ()* @vf1 to i8*)], !type !0 +@vt2 = constant [1 x i8*] [i8* bitcast (i32 ()* @vf2 to i8*)], !type !0 define i32 @vf1() readnone { ret i32 1 diff --git a/test/Transforms/WholeProgramDevirt/vcp-non-constant-arg.ll b/test/Transforms/WholeProgramDevirt/vcp-non-constant-arg.ll index 197c923c3a1c..cc2ff33296a9 100644 --- a/test/Transforms/WholeProgramDevirt/vcp-non-constant-arg.ll +++ b/test/Transforms/WholeProgramDevirt/vcp-non-constant-arg.ll @@ -3,8 +3,8 @@ target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" -@vt1 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0 -@vt2 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0 +@vt1 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0 +@vt2 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0 define i32 @vf1(i8* %this, i32 %arg) readnone { ret i32 %arg diff --git a/test/Transforms/WholeProgramDevirt/vcp-too-wide-ints.ll b/test/Transforms/WholeProgramDevirt/vcp-too-wide-ints.ll index 
93936d5e1d27..c24c3b4be683 100644 --- a/test/Transforms/WholeProgramDevirt/vcp-too-wide-ints.ll +++ b/test/Transforms/WholeProgramDevirt/vcp-too-wide-ints.ll @@ -3,33 +3,63 @@ target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" -@vt1 = global [1 x i8*] [i8* bitcast (i128 (i8*, i128)* @vf1 to i8*)], !type !0 -@vt2 = global [1 x i8*] [i8* bitcast (i128 (i8*, i128)* @vf2 to i8*)], !type !0 +@vt1 = constant [1 x i8*] [i8* bitcast (i64 (i8*, i128)* @vf1 to i8*)], !type !0 +@vt2 = constant [1 x i8*] [i8* bitcast (i64 (i8*, i128)* @vf2 to i8*)], !type !0 +@vt3 = constant [1 x i8*] [i8* bitcast (i128 (i8*, i64)* @vf3 to i8*)], !type !1 +@vt4 = constant [1 x i8*] [i8* bitcast (i128 (i8*, i64)* @vf4 to i8*)], !type !1 -define i128 @vf1(i8* %this, i128 %arg) readnone { - ret i128 %arg +define i64 @vf1(i8* %this, i128 %arg) readnone { + %argtrunc = trunc i128 %arg to i64 + ret i64 %argtrunc } -define i128 @vf2(i8* %this, i128 %arg) readnone { - ret i128 %arg +define i64 @vf2(i8* %this, i128 %arg) readnone { + %argtrunc = trunc i128 %arg to i64 + ret i64 %argtrunc } -; CHECK: define i128 @call -define i128 @call(i8* %obj) { +define i128 @vf3(i8* %this, i64 %arg) readnone { + %argzext = zext i64 %arg to i128 + ret i128 %argzext +} + +define i128 @vf4(i8* %this, i64 %arg) readnone { + %argzext = zext i64 %arg to i128 + ret i128 %argzext +} + +; CHECK: define i64 @call1 +define i64 @call1(i8* %obj) { + %vtableptr = bitcast i8* %obj to [1 x i8*]** + %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr + %vtablei8 = bitcast [1 x i8*]* %vtable to i8* + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid1") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0 + %fptr = load i8*, i8** %fptrptr + %fptr_casted = bitcast i8* %fptr to i64 (i8*, i128)* + ; CHECK: call i64 % + %result = call i64 %fptr_casted(i8* %obj, i128 1) + ret i64 %result +} + +; CHECK: define i128 @call2 +define i128 @call2(i8* %obj) { %vtableptr = bitcast i8* %obj to [1 x i8*]** %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr %vtablei8 = bitcast [1 x i8*]* %vtable to i8* - %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid") + %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid2") call void @llvm.assume(i1 %p) %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0 %fptr = load i8*, i8** %fptrptr - %fptr_casted = bitcast i8* %fptr to i128 (i8*, i128)* + %fptr_casted = bitcast i8* %fptr to i128 (i8*, i64)* ; CHECK: call i128 % - %result = call i128 %fptr_casted(i8* %obj, i128 1) + %result = call i128 %fptr_casted(i8* %obj, i64 1) ret i128 %result } declare i1 @llvm.type.test(i8*, metadata) declare void @llvm.assume(i1) -!0 = !{i32 0, !"typeid"} +!0 = !{i32 0, !"typeid1"} +!1 = !{i32 0, !"typeid2"} diff --git a/test/Transforms/WholeProgramDevirt/vcp-type-mismatch.ll b/test/Transforms/WholeProgramDevirt/vcp-type-mismatch.ll index 3124889a7070..7016263f8f7b 100644 --- a/test/Transforms/WholeProgramDevirt/vcp-type-mismatch.ll +++ b/test/Transforms/WholeProgramDevirt/vcp-type-mismatch.ll @@ -1,10 +1,16 @@ ; RUN: opt -S -wholeprogramdevirt %s | FileCheck %s +; Test that we correctly handle function type mismatches in argument counts +; and bitwidths. We handle an argument count mismatch by refusing +; to optimize. For bitwidth mismatches, we allow the optimization in order +; to simplify the implementation. This is legal because the bitwidth mismatch +; gives the call undefined behavior. 
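+; In @bad_arg_type below, for example, an i32 (i8*, i32) implementation is
+; called through an i32 (i8*, i64)* cast; executing such a call is undefined,
+; so folding it to the constant (the new "ret i32 1" check) is sound.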
+ target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" -@vt1 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0 -@vt2 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0 +@vt1 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0 +@vt2 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0 define i32 @vf1(i8* %this, i32 %arg) readnone { ret i32 %arg @@ -24,8 +30,8 @@ define i32 @bad_arg_type(i8* %obj) { %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0 %fptr = load i8*, i8** %fptrptr %fptr_casted = bitcast i8* %fptr to i32 (i8*, i64)* - ; CHECK: call i32 % %result = call i32 %fptr_casted(i8* %obj, i64 1) + ; CHECK: ret i32 1 ret i32 %result } @@ -54,8 +60,8 @@ define i64 @bad_return_type(i8* %obj) { %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0 %fptr = load i8*, i8** %fptrptr %fptr_casted = bitcast i8* %fptr to i64 (i8*, i32)* - ; CHECK: call i64 % %result = call i64 %fptr_casted(i8* %obj, i32 1) + ; CHECK: ret i64 1 ret i64 %result } diff --git a/test/Transforms/WholeProgramDevirt/vcp-uses-this.ll b/test/Transforms/WholeProgramDevirt/vcp-uses-this.ll index fc4dee37dba7..542402e16577 100644 --- a/test/Transforms/WholeProgramDevirt/vcp-uses-this.ll +++ b/test/Transforms/WholeProgramDevirt/vcp-uses-this.ll @@ -3,8 +3,8 @@ target datalayout = "e-p:64:64" target triple = "x86_64-unknown-linux-gnu" -@vt1 = global [1 x i8*] [i8* bitcast (i32 (i8*)* @vf1 to i8*)], !type !0 -@vt2 = global [1 x i8*] [i8* bitcast (i32 (i8*)* @vf2 to i8*)], !type !0 +@vt1 = constant [1 x i8*] [i8* bitcast (i32 (i8*)* @vf1 to i8*)], !type !0 +@vt2 = constant [1 x i8*] [i8* bitcast (i32 (i8*)* @vf2 to i8*)], !type !0 define i32 @vf1(i8* %this) readnone { %this_int = ptrtoint i8* %this to i32 diff --git a/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll b/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll index 530fe8aa89d0..080ed6caac5e 100644 --- a/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll +++ b/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll @@ -78,7 +78,7 @@ define i1 @call1(i8* %obj) { %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 0 %fptr = load i8*, i8** %fptrptr %fptr_casted = bitcast i8* %fptr to i1 (i8*)* - ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i64 -5 + ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i32 -5 ; CHECK: [[VTLOAD1:%[^ ]*]] = load i8, i8* [[VTGEP1]] ; CHECK: [[VTAND1:%[^ ]*]] = and i8 [[VTLOAD1]], 2 ; CHECK: [[VTCMP1:%[^ ]*]] = icmp ne i8 [[VTAND1]], 0 @@ -98,7 +98,7 @@ define i1 @call2(i8* %obj) { %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 1 %fptr = load i8*, i8** %fptrptr %fptr_casted = bitcast i8* %fptr to i1 (i8*)* - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i64 -5 + ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i32 -5 ; CHECK: [[VTLOAD2:%[^ ]*]] = load i8, i8* [[VTGEP2]] ; CHECK: [[VTAND2:%[^ ]*]] = and i8 [[VTLOAD2]], 1 ; CHECK: [[VTCMP2:%[^ ]*]] = icmp ne i8 [[VTAND2]], 0 @@ -118,7 +118,7 @@ define i32 @call3(i8* %obj) { %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 2 %fptr = load i8*, i8** %fptrptr %fptr_casted = bitcast i8* %fptr to i32 (i8*)* - ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i64 -4 + ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i32 -4 ; CHECK: [[VTBC3:%[^ ]*]] = bitcast i8* [[VTGEP3]] to i32* ; CHECK: 
[[VTLOAD3:%[^ ]*]] = load i32, i32* [[VTBC3]] %result = call i32 %fptr_casted(i8* %obj) diff --git a/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll b/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll index fcf00d6d86c2..3299f7bce65b 100644 --- a/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll +++ b/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll @@ -87,7 +87,7 @@ define i1 @call1(i8* %obj) { %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 0, metadata !"typeid") %fptr = extractvalue {i8*, i1} %pair, 0 %fptr_casted = bitcast i8* %fptr to i1 (i8*)* - ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i64 -5 + ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i32 -5 ; CHECK: [[VTLOAD1:%[^ ]*]] = load i8, i8* [[VTGEP1]] ; CHECK: [[VTAND1:%[^ ]*]] = and i8 [[VTLOAD1]], 2 ; CHECK: [[VTCMP1:%[^ ]*]] = icmp ne i8 [[VTAND1]], 0 @@ -108,7 +108,7 @@ define i1 @call2(i8* %obj) { %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 8, metadata !"typeid") %fptr = extractvalue {i8*, i1} %pair, 0 %fptr_casted = bitcast i8* %fptr to i1 (i8*)* - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i64 -5 + ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i32 -5 ; CHECK: [[VTLOAD2:%[^ ]*]] = load i8, i8* [[VTGEP2]] ; CHECK: [[VTAND2:%[^ ]*]] = and i8 [[VTLOAD2]], 1 ; CHECK: [[VTCMP2:%[^ ]*]] = icmp ne i8 [[VTAND2]], 0 @@ -129,7 +129,7 @@ define i32 @call3(i8* %obj) { %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 16, metadata !"typeid") %fptr = extractvalue {i8*, i1} %pair, 0 %fptr_casted = bitcast i8* %fptr to i32 (i8*)* - ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i64 -4 + ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i32 -4 ; CHECK: [[VTBC3:%[^ ]*]] = bitcast i8* [[VTGEP3]] to i32* ; CHECK: [[VTLOAD3:%[^ ]*]] = load i32, i32* [[VTBC3]] %result = call i32 %fptr_casted(i8* %obj) diff --git a/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll b/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll index 75ec6ba95ef1..14360c78d950 100644 --- a/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll +++ b/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll @@ -73,7 +73,7 @@ define i1 @call1(i8* %obj) { %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 0 %fptr = load i8*, i8** %fptrptr %fptr_casted = bitcast i8* %fptr to i1 (i8*)* - ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i64 28 + ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i32 28 ; CHECK: [[VTLOAD1:%[^ ]*]] = load i8, i8* [[VTGEP1]] ; CHECK: [[VTAND1:%[^ ]*]] = and i8 [[VTLOAD1]], 2 ; CHECK: [[VTCMP1:%[^ ]*]] = icmp ne i8 [[VTAND1]], 0 @@ -93,7 +93,7 @@ define i1 @call2(i8* %obj) { %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 1 %fptr = load i8*, i8** %fptrptr %fptr_casted = bitcast i8* %fptr to i1 (i8*)* - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i64 28 + ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i32 28 ; CHECK: [[VTLOAD2:%[^ ]*]] = load i8, i8* [[VTGEP2]] ; CHECK: [[VTAND2:%[^ ]*]] = and i8 [[VTLOAD2]], 1 ; CHECK: [[VTCMP2:%[^ ]*]] = icmp ne i8 [[VTAND2]], 0 @@ -113,7 +113,7 @@ define i32 @call3(i8* %obj) { %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 2 %fptr = load i8*, i8** %fptrptr %fptr_casted = bitcast i8* %fptr to i32 (i8*)* - ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i64 24 + ; CHECK: [[VTGEP3:%[^ 
]*]] = getelementptr i8, i8* [[VT3]], i32 24 ; CHECK: [[VTBC3:%[^ ]*]] = bitcast i8* [[VTGEP3]] to i32* ; CHECK: [[VTLOAD3:%[^ ]*]] = load i32, i32* [[VTBC3]] %result = call i32 %fptr_casted(i8* %obj)
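The last three hunks make the same change as virtual-const-prop-begin.ll and virtual-const-prop-check.ll before them: the byte-offset GEPs emitted by virtual constant propagation now use i32 rather than i64 indices. Pulled out of the CHECK lines into a standalone function (name illustrative; offsets as in virtual-const-prop-begin.ll), the matched load sequence is:

define i1 @vcp_load_sketch(i8* %vtablei8) {
  ; A negative byte offset reaches the constants laid out before the vtable;
  ; the i32 index is what these hunks newly expect.
  %vtgep = getelementptr i8, i8* %vtablei8, i32 -5
  %vtload = load i8, i8* %vtgep
  %vtand = and i8 %vtload, 2
  %vtcmp = icmp ne i8 %vtand, 0
  ret i1 %vtcmp
}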