16 files changed, 749 insertions, 82 deletions
diff --git a/test/Transforms/LICM/atomics.ll b/test/Transforms/LICM/atomics.ll
index d23cb49c5486..15c461aeca27 100644
--- a/test/Transforms/LICM/atomics.ll
+++ b/test/Transforms/LICM/atomics.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -S -basicaa -licm | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
 
 ; Check that we can hoist unordered loads
 define i32 @test1(i32* nocapture %y) nounwind uwtable ssp {
@@ -60,8 +60,7 @@ end:
 ; CHECK-NEXT: br label %loop
 }
 
-; Don't try to "sink" unordered stores yet; it is legal, but the machinery
-; isn't there.
+; We can sink an unordered store
 define i32 @test4(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
 entry:
   br label %loop
@@ -75,6 +74,149 @@ loop:
 end:
   ret i32 %vala
 ; CHECK-LABEL: define i32 @test4(
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NOT: store
+; CHECK-LABEL: end:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %vala
+; CHECK:   store atomic i32 %[[LCSSAPHI]], i32* %x unordered, align 4
+}
+
+; We currently don't handle ordered atomics.
+define i32 @test5(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x release, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test5(
 ; CHECK: load atomic i32, i32* %y monotonic
 ; CHECK-NEXT: store atomic
 }
+
+; We currently don't touch volatiles
+define i32 @test6(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store volatile i32 %vala, i32* %x, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test6(
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store volatile
+}
+
+; We currently don't touch volatiles
+define i32 @test6b(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic volatile i32 %vala, i32* %x unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test6b(
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store atomic volatile
+}
+
+; Mixing unordered atomics and normal loads/stores is
+; currently unimplemented
+define i32 @test7(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  store i32 5, i32* %x
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test7(
+; CHECK: store i32 5, i32* %x
+; CHECK-NEXT: load atomic i32, i32* %y
+; CHECK-NEXT: store atomic i32
+}
+
+; Three provably noalias locations - we can sink normal and unordered, but
+;  not monotonic
+define i32 @test7b(i32* nocapture noalias %x, i32* nocapture %y, i32* noalias nocapture %z) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  store i32 5, i32* %x
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %z unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test7b(
+; CHECK: load atomic i32, i32* %y monotonic
+
+; CHECK-LABEL: end:
+; CHECK: store i32 5, i32* %x
+; CHECK: store atomic i32 %{{.+}}, i32* %z unordered, align 4
+}
+
+
+define i32 @test8(i32* nocapture noalias %x, i32* nocapture %y) {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x unordered, align 4
+  fence release
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test8(
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store atomic
+; CHECK-NEXT: fence
+}
+
+; Exact semantics of monotonic accesses are a bit vague in the C++ spec;
+; for the moment, be conservative and don't touch them.
+define i32 @test9(i32* nocapture noalias %x, i32* nocapture %y) {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x monotonic, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test9(
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT:   store atomic i32 %vala, i32* %x monotonic, align 4
+}
diff --git a/test/Transforms/LICM/constexpr.ll b/test/Transforms/LICM/constexpr.ll
index 8ffc73513600..488821ac8fd4 100644
--- a/test/Transforms/LICM/constexpr.ll
+++ b/test/Transforms/LICM/constexpr.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -S -basicaa -licm | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
 ; This fixes PR22460
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/Transforms/LICM/hoist-bitcast-load.ll b/test/Transforms/LICM/hoist-bitcast-load.ll
index 6ef00738820e..956c7283be31 100644
--- a/test/Transforms/LICM/hoist-bitcast-load.ll
+++ b/test/Transforms/LICM/hoist-bitcast-load.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -S -basicaa -licm < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
+
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/Transforms/LICM/hoist-deref-load.ll b/test/Transforms/LICM/hoist-deref-load.ll
index e67becdeb5e4..b48c9e5c7b14 100644
--- a/test/Transforms/LICM/hoist-deref-load.ll
+++ b/test/Transforms/LICM/hoist-deref-load.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -S -basicaa -licm < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
+
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/Transforms/LICM/hoist-fast-fdiv.ll b/test/Transforms/LICM/hoist-fast-fdiv.ll
new file mode 100644
index 000000000000..f61564fd726c
--- /dev/null
+++ b/test/Transforms/LICM/hoist-fast-fdiv.ll
@@ -0,0 +1,34 @@
+; RUN: opt -licm -S < %s | FileCheck %s
+
+; Function Attrs: noinline norecurse nounwind readnone ssp uwtable
+define zeroext i1 @f(double %v) #0 {
+entry:
+; CHECK-LABEL: @f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: fdiv fast double 1.000000e+00, %v
+  br label %loop
+
+loop:                                       ; preds = %entry, %loop
+  %v3 = phi i32 [ 0, %entry ], [ %v11, %loop ]
+  %v4 = phi i32 [ 0, %entry ], [ %v12, %loop ]
+  %v5 = uitofp i32 %v4 to double
+
+; CHECK-LABEL: loop:
+; CHECK: fmul fast double
+; CHECK-NOT: fdiv
+  %v6 = fdiv fast double %v5, %v
+  %v7 = fptoui double %v6 to i64
+  %v8 = and i64 %v7, 1
+  %v9 = xor i64 %v8, 1
+  %v10 = trunc i64 %v9 to i32
+  %v11 = add i32 %v10, %v3
+  %v12 = add nuw i32 %v4, 1
+  %v13 = icmp eq i32 %v12, -1
+  br i1 %v13, label %end, label %loop
+
+end:                                      ; preds = %loop
+  %v15 = phi i32 [ %v11, %loop ]
+  %v16 = icmp ne i32 %v15, 0
+  ret i1 %v16
+}
+
diff --git a/test/Transforms/LICM/hoist-nounwind.ll b/test/Transforms/LICM/hoist-nounwind.ll
index e9720235893a..9fc4903b8302 100644
--- a/test/Transforms/LICM/hoist-nounwind.ll
+++ b/test/Transforms/LICM/hoist-nounwind.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -basicaa -licm < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/Transforms/LICM/hoist-round.ll b/test/Transforms/LICM/hoist-round.ll
index 9c6a3a180b50..87a7050668de 100644
--- a/test/Transforms/LICM/hoist-round.ll
+++ b/test/Transforms/LICM/hoist-round.ll
@@ -18,6 +18,7 @@ target datalayout = "E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:3
 ; CHECK: call float @llvm.copysign.f32
 ; CHECK: call float @llvm.minnum.f32
 ; CHECK: call float @llvm.maxnum.f32
+; CHECK: call float @llvm.powi.f32
 ; CHECK: for.body:
 
 define void @test(float %arg1, float %arg2) {
@@ -40,7 +41,8 @@ for.body:
   %tmp.8 = call float @llvm.copysign.f32(float %tmp.7, float %arg2)
   %tmp.9 = call float @llvm.minnum.f32(float %tmp.8, float %arg2)
   %tmp.10 = call float @llvm.maxnum.f32(float %tmp.9, float %arg2)
-  call void @consume(float %tmp.10)
+  %tmp.11 = call float @llvm.powi.f32(float %tmp.10, i32 4)
+  call void @consume(float %tmp.11)
   %IND.new = add i32 %IND, 1
   br label %for.head
 
@@ -60,3 +62,4 @@ declare float @llvm.fabs.f32(float)
 declare float @llvm.copysign.f32(float, float)
 declare float @llvm.minnum.f32(float, float)
 declare float @llvm.maxnum.f32(float, float)
+declare float @llvm.powi.f32(float, i32)
diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll
index 29595b3e1cc0..cbd17689e939 100644
--- a/test/Transforms/LICM/hoisting.ll
+++ b/test/Transforms/LICM/hoisting.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -licm -S | FileCheck %s
-; RUN: opt -lcssa %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -S | FileCheck %s
 
 @X = global i32 0		; <i32*> [#uses=1]
 
@@ -149,3 +149,174 @@ latch:
 return:
   ret i32 %sum
 }
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) nounwind
+declare void @escaping.invariant.start({}*) nounwind
+; invariant.start dominates the load, and in this scope, the
+; load is invariant. So, we can hoist the `addrld` load out of the loop.
+define i32 @test_fence(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+
+
+; Same as the test above, but the load is no longer invariant (presence of
+; invariant.end). We cannot hoist the addrld out of the loop.
+define i32 @test_fence1(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence1
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NEXT: invariant.end
+; CHECK-NEXT: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  call void @llvm.invariant.end.p0i8({}* %invst, i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; Same as the test above, but instead of invariant.end, we have the result of
+; invariant.start escaping through a call. We cannot hoist the load.
+define i32 @test_fence2(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence2
+; CHECK-LABEL: entry
+; CHECK-NOT: load
+; CHECK: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  call void @escaping.invariant.start({}* %invst)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; FIXME: invariant.start dominates the load, and in this scope, the
+; load is invariant. So, we can hoist the `addrld` load out of the loop.
+; Consider the case where the load operand addr.i is bitcast before being
+; passed to invariant.start.
+define i32 @test_fence3(i32* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence3
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %addr.i = getelementptr inbounds i32, i32* %addr, i64 8
+  %gep = bitcast i32* %addr.i to i8 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; We should not hoist the addrld out of the loop.
+define i32 @test_fence4(i32* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence4
+; CHECK-LABEL: entry
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %addr.i = getelementptr inbounds i32, i32* %addr, i64 8
+  %gep = bitcast i32* %addr.i to i8 *
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
diff --git a/test/Transforms/LICM/loopsink.ll b/test/Transforms/LICM/loopsink.ll
index 5004752d1031..b203ea8b51ad 100644
--- a/test/Transforms/LICM/loopsink.ll
+++ b/test/Transforms/LICM/loopsink.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-sink < %s | FileCheck %s
+; RUN: opt -S -passes=loop-sink < %s | FileCheck %s
 
 @g = global i32 0, align 4
 
diff --git a/test/Transforms/LICM/opt-remarks.ll b/test/Transforms/LICM/opt-remarks.ll
index f0ef386c9f9a..b44fc57131a5 100644
--- a/test/Transforms/LICM/opt-remarks.ll
+++ b/test/Transforms/LICM/opt-remarks.ll
@@ -10,7 +10,7 @@ Loop:
   %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
   %addr = getelementptr i32, i32* %array, i32 %j
   %a = load i32, i32* %addr
-; CHECK: remark: /tmp/kk.c:2:20: hosting load
+; CHECK: remark: /tmp/kk.c:2:20: hoisting load
   %b = load i32, i32* %p, !dbg !8
   %a2 = add i32 %a, %b
   store i32 %a2, i32* %addr
diff --git a/test/Transforms/LICM/pr32129.ll b/test/Transforms/LICM/pr32129.ll
new file mode 100644
index 000000000000..2618afe46322
--- /dev/null
+++ b/test/Transforms/LICM/pr32129.ll
@@ -0,0 +1,18 @@
+; RUN: opt -S -licm -loop-unswitch -licm < %s | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NOT: guard
+entry:
+  br label %header
+
+header:
+  br label %loop
+
+loop:
+  %0 = icmp ult i32 0, 400
+  call void (i1, ...) @llvm.experimental.guard(i1 %0, i32 9) [ "deopt"() ]
+  br i1 undef, label %header, label %loop
+}
diff --git a/test/Transforms/LICM/scalar-promote-unwind.ll b/test/Transforms/LICM/scalar-promote-unwind.ll
new file mode 100644
index 000000000000..f1f52eed1d4c
--- /dev/null
+++ b/test/Transforms/LICM/scalar-promote-unwind.ll
@@ -0,0 +1,263 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Make sure we don't hoist the store out of the loop; %a would
+; have the wrong value if f() unwinds
+
+define void @test1(i32* nocapture noalias %a, i1 zeroext %y) uwtable {
+entry:
+  br label %for.body
+
+for.body:
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %0 = load i32, i32* %a, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a, align 4
+  br i1 %y, label %if.then, label %for.inc
+
+; CHECK: define void @test1
+; CHECK: load i32, i32*
+; CHECK-NEXT: add
+; CHECK-NEXT: store i32
+
+if.then:
+  tail call void @f()
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; We can hoist the store out of the loop here; if f() unwinds,
+; the lifetime of %a ends.
+
+define void @test2(i1 zeroext %y) uwtable {
+entry:
+  %a = alloca i32
+  br label %for.body
+
+for.body:
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %0 = load i32, i32* %a, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a, align 4
+  br i1 %y, label %if.then, label %for.inc
+
+if.then:
+  tail call void @f()
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+; CHECK: define void @test2
+; CHECK: store i32
+; CHECK-NEXT: ret void
+  ret void
+}
+
+@_ZTIi = external constant i8*
+
+; In this test, the loop is within a try block. There is an explicit unwind edge out of the loop.
+; Make sure this edge is treated as a loop exit, and that the loads and stores are promoted as
+; expected
+define void @loop_within_tryblock() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %a = alloca i32, align 4
+  store i32 0, i32* %a, align 4
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK: for.body:
+; CHECK-NOT: load
+; CHECK-NOT: store 
+; CHECK: invoke
+for.body:
+  %0 = load i32, i32* %a, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a, align 4
+  invoke void @boo()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+; CHECK: lpad:
+; CHECK: store
+; CHECK: br
+lpad:
+  %1 = landingpad { i8*, i32 }
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %2 = extractvalue { i8*, i32 } %1, 0
+  %3 = extractvalue { i8*, i32 } %1, 1
+  br label %catch.dispatch
+
+catch.dispatch:
+  %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) #3
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:
+  %5 = call i8* @__cxa_begin_catch(i8* %2) #3
+  %6 = bitcast i8* %5 to i32*
+  %7 = load i32, i32* %6, align 4
+  call void @__cxa_end_catch() #3
+  br label %try.cont
+
+try.cont:
+  ret void
+
+for.end:
+  br label %try.cont
+
+eh.resume:
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %2, 0
+  %lpad.val3 = insertvalue { i8*, i32 } %lpad.val, i32 %3, 1
+  resume { i8*, i32 } %lpad.val3
+}
+
+
+; The malloc'ed memory is not captured and is therefore promoted.
+define void @malloc_no_capture() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %call = call i8* @malloc(i64 4)
+  %0 = bitcast i8* %call to i32*
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: br 
+for.body:
+  %i.0 = phi i32 [ 0, %entry  ], [ %inc, %for.latch ]
+  %1 = load i32, i32* %0, align 4
+  %add = add nsw i32 %1, 1
+  store i32 %add, i32* %0, align 4
+  br label %for.call
+
+for.call:
+  invoke void @boo()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  br label %for.latch
+
+for.latch:
+  %inc = add i32 %i.0, 1
+  %cmp = icmp slt i32 %i.0, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  br label %fun.ret
+
+lpad:
+  %2 = landingpad { i8*, i32 }
+          catch i8* null
+  %3 = extractvalue { i8*, i32 } %2, 0
+  %4 = extractvalue { i8*, i32 } %2, 1
+  br label %catch
+
+catch:
+  %5 = call i8* @__cxa_begin_catch(i8* %3) #4
+  %6 = bitcast i32* %0 to i8*
+  call void @free(i8* %6)
+  call void @__cxa_end_catch()
+  br label %fun.ret
+
+fun.ret:
+  ret void
+}
+
+; The malloc'ed memory can be captured and is therefore not promoted.
+define void @malloc_capture(i32** noalias %A) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %call = call i8* @malloc(i64 4)
+  %0 = bitcast i8* %call to i32*
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: load
+; CHECK: store
+; CHECK: br 
+for.body:
+  %i.0 = phi i32 [ 0, %entry  ], [ %inc, %for.latch ]
+  %1 = load i32, i32* %0, align 4
+  %add = add nsw i32 %1, 1
+  store i32 %add, i32* %0, align 4
+  br label %for.call
+
+for.call:
+  invoke void @boo_readnone()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  br label %for.latch
+
+for.latch:
+  store i32* %0, i32** %A 
+  %inc = add i32 %i.0, 1
+  %cmp = icmp slt i32 %i.0, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  br label %fun.ret
+
+lpad:
+  %2 = landingpad { i8*, i32 }
+          catch i8* null
+  %3 = extractvalue { i8*, i32 } %2, 0
+  %4 = extractvalue { i8*, i32 } %2, 1
+  br label %catch
+
+catch:
+  %5 = call i8* @__cxa_begin_catch(i8* %3) #4
+  %6 = bitcast i32* %0 to i8*
+  call void @free(i8* %6)
+  call void @__cxa_end_catch()
+  br label %fun.ret
+
+fun.ret:
+  ret void
+}
+
+; Function Attrs: nounwind
+declare noalias i8* @malloc(i64)
+
+; Function Attrs: nounwind
+declare void @free(i8* nocapture)
+
+declare void @boo() 
+
+; This is an artificial example: readnone functions by definition cannot unwind
+; exceptions by calling the C++ exception throwing methods.
+; This function should only be used to test malloc_capture.
+declare void @boo_readnone() readnone
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+declare i32 @llvm.eh.typeid.for(i8*)
+
+declare void @f() uwtable
diff --git a/test/Transforms/LICM/scalar_promote.ll b/test/Transforms/LICM/scalar-promote.ll
index c88701154b8f..89888546494f 100644
--- a/test/Transforms/LICM/scalar_promote.ll
+++ b/test/Transforms/LICM/scalar-promote.ll
@@ -378,6 +378,33 @@ exit:
   ret i32 %ret
 }
 
+define void @test10(i32 %i) {
+Entry:
+  br label %Loop
+; CHECK-LABEL: @test10(
+; CHECK: Entry:
+; CHECK-NEXT:   load atomic i32, i32* @X unordered, align 4
+; CHECK-NEXT:   br label %Loop
+
+
+Loop:   ; preds = %Loop, %0
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]    ; <i32> [#uses=1]
+  %x = load atomic i32, i32* @X unordered, align 4
+  %x2 = add i32 %x, 1
+  store atomic i32 %x2, i32* @X unordered, align 4
+  %Next = add i32 %j, 1
+  %cond = icmp eq i32 %Next, 0
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
+; CHECK: Out:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store atomic i32 %[[LCSSAPHI]], i32* @X unordered, align 4
+; CHECK-NEXT:   ret void
+
+}
+
 !0 = !{!4, !4, i64 0}
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LICM/scalar_promote-unwind.ll b/test/Transforms/LICM/scalar_promote-unwind.ll
deleted file mode 100644
index dd3693b4af63..000000000000
--- a/test/Transforms/LICM/scalar_promote-unwind.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; RUN: opt < %s -basicaa -licm -S | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Make sure we don't hoist the store out of the loop; %a would
-; have the wrong value if f() unwinds
-
-define void @test1(i32* nocapture noalias %a, i1 zeroext %y) uwtable {
-entry:
-  br label %for.body
-
-for.body:
-  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %0 = load i32, i32* %a, align 4
-  %add = add nsw i32 %0, 1
-  store i32 %add, i32* %a, align 4
-  br i1 %y, label %if.then, label %for.inc
-
-; CHECK: define void @test1
-; CHECK: load i32, i32*
-; CHECK-NEXT: add
-; CHECK-NEXT: store i32
-
-if.then:
-  tail call void @f()
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i32 %i.03, 1
-  %exitcond = icmp eq i32 %inc, 10000
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-
-for.cond.cleanup:
-  ret void
-}
-
-; We can hoist the store out of the loop here; if f() unwinds,
-; the lifetime of %a ends.
-
-define void @test2(i1 zeroext %y) uwtable {
-entry:
-  %a = alloca i32
-  br label %for.body
-
-for.body:
-  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %0 = load i32, i32* %a, align 4
-  %add = add nsw i32 %0, 1
-  store i32 %add, i32* %a, align 4
-  br i1 %y, label %if.then, label %for.inc
-
-if.then:
-  tail call void @f()
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i32 %i.03, 1
-  %exitcond = icmp eq i32 %inc, 10000
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-
-for.cond.cleanup:
-  ret void
-
-; CHECK: define void @test2
-; CHECK: store i32
-; CHECK-NEXT: ret void
-  ret void
-}
-
-declare void @f() uwtable
diff --git a/test/Transforms/LICM/sink.ll b/test/Transforms/LICM/sink.ll
index cf169ddc12a9..70fa6fa13e3e 100644
--- a/test/Transforms/LICM/sink.ll
+++ b/test/Transforms/LICM/sink.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-LICM
 ; RUN: opt -S -licm < %s | opt -S -loop-sink | FileCheck %s --check-prefix=CHECK-SINK
+; RUN: opt -S < %s -passes='require<opt-remark-emit>,loop(licm),loop-sink' \
+; RUN:     | FileCheck %s --check-prefix=CHECK-SINK
 
 ; Original source code:
 ; int g;
diff --git a/test/Transforms/LICM/unrolled-deeply-nested.ll b/test/Transforms/LICM/unrolled-deeply-nested.ll
new file mode 100644
index 000000000000..c0f2c9818000
--- /dev/null
+++ b/test/Transforms/LICM/unrolled-deeply-nested.ll
@@ -0,0 +1,76 @@
+; Test that LICM correctly detects conflicting accesses to memory in deeply
+; nested subloops. This works in the legacy PM due to a special retained map of
+; alias information for inner loops, and in the new PM it is recomputed for each
+; loop.
+;
+; RUN: opt -S -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' < %s | FileCheck %s
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+
+define i32 @test(i32* %a, i64 %n.0, i64 %n.0.0, i64 %n.0.0.0, i64 %n.0.0.0.0) nounwind uwtable readonly {
+; CHECK-LABEL: define i32 @test
+entry:
+  %b = alloca i32
+  %c = alloca i32
+  %a.i8 = bitcast i32* %a to i8*
+  %b.i8 = bitcast i32* %b to i8*
+  %c.i8 = bitcast i32* %c to i8*
+  br label %l.0.header
+; CHECK: %b = alloca i32
+; CHECK: %c = alloca i32
+; CHECK: %[[AI8:.*]] = bitcast i32* %a to i8*
+; CHECK: %[[BI8:.*]] = bitcast i32* %b to i8*
+; CHECK: %[[CI8:.*]] = bitcast i32* %c to i8*
+; CHECK-NOT: load
+; CHECK: br
+
+l.0.header:
+  %iv.0 = phi i64 [ %iv.0.next, %l.0.latch ], [ 0, %entry ]
+  %iv.0.next = add i64 %iv.0, 1
+  %exitcond.0 = icmp eq i64 %iv.0.next, %n.0
+  %a.val = load i32, i32* %a
+  store i32 %a.val, i32* %b
+  %c.val = trunc i64 %iv.0 to i32
+  store i32 %c.val, i32* %c
+  br label %l.0.0.header
+; CHECK: %[[AV:.*]] = load i32, i32* %a
+; CHECK: store i32 %[[AV]], i32* %b
+; CHECK: %[[CT:.*]] = trunc i64 {{.*}} to i32
+; CHECK: store i32 %[[CT]], i32* %c
+; CHECK: br
+
+l.0.0.header:
+  %iv.0.0 = phi i64 [ %iv.0.0.next, %l.0.0.latch ], [ 0, %l.0.header ]
+  %iv.0.0.next = add i64 %iv.0.0, 1
+  %exitcond.0.0 = icmp eq i64 %iv.0.0.next, %n.0.0
+  br label %l.0.0.0.header
+; CHECK: br
+
+l.0.0.0.header:
+  %iv.0.0.0 = phi i64 [ %iv.0.0.0.next, %l.0.0.0.header ], [ 0, %l.0.0.header ]
+  %iv.0.0.0.next = add i64 %iv.0.0.0, 1
+  %exitcond.0.0.0 = icmp eq i64 %iv.0.0.0.next, %n.0.0.0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a.i8, i8* %c.i8, i64 4, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b.i8, i8* %c.i8, i64 4, i32 1, i1 false)
+  br i1 %exitcond.0.0.0, label %l.0.0.0.header, label %l.0.0.latch
+; CHECK: call void @llvm.memcpy.{{.*}}(i8* %[[AI8]], i8* %[[CI8]], i64 4
+; CHECK: call void @llvm.memcpy.{{.*}}(i8* %[[BI8]], i8* %[[CI8]], i64 4
+; CHECK: br
+
+l.0.0.latch:
+  br i1 %exitcond.0.0, label %l.0.0.header, label %l.0.latch
+; CHECK: br
+
+l.0.latch:
+  %b.val = load i32, i32* %b
+  br i1 %exitcond.0, label %exit, label %l.0.header
+; CHECK: %[[BV:.*]] = load i32, i32* %b
+; CHECK: br
+
+exit:
+  %result.lcssa = phi i32 [ %b.val, %l.0.latch ]
+  ret i32 %b.val
+; CHECK: %[[LCSSA:.*]] = phi i32 [ %[[BV]], %{{.*}} ]
+; CHECK: ret i32 %[[LCSSA]]
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)