Diffstat (limited to 'llvm/lib/CodeGen/AtomicExpandPass.cpp')
-rw-r--r--    llvm/lib/CodeGen/AtomicExpandPass.cpp    345
1 file changed, 187 insertions(+), 158 deletions(-)
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 37a50cde6391..a5030305435c 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -89,7 +89,7 @@ namespace {
AtomicRMWInst *I,
TargetLoweringBase::AtomicExpansionKind ExpansionKind);
AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI);
- void expandPartwordCmpXchg(AtomicCmpXchgInst *I);
+ bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);
void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);
@@ -105,7 +105,7 @@ namespace {
bool isIdempotentRMW(AtomicRMWInst *RMWI);
bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);
- bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
+ bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment,
Value *PointerOperand, Value *ValueOperand,
Value *CASExpected, AtomicOrdering Ordering,
AtomicOrdering Ordering2,
@@ -152,47 +152,15 @@ static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}
-// Helper functions to retrieve the alignment of atomic instructions.
-static unsigned getAtomicOpAlign(LoadInst *LI) {
- unsigned Align = LI->getAlignment();
- // In the future, if this IR restriction is relaxed, we should
- // return DataLayout::getABITypeAlignment when there's no align
- // value.
- assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
- return Align;
-}
-
-static unsigned getAtomicOpAlign(StoreInst *SI) {
- unsigned Align = SI->getAlignment();
- // In the future, if this IR restriction is relaxed, we should
- // return DataLayout::getABITypeAlignment when there's no align
- // value.
- assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
- return Align;
-}
-
-static unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
- // TODO(PR27168): This instruction has no alignment attribute, but unlike the
- // default alignment for load/store, the default here is to assume
- // it has NATURAL alignment, not DataLayout-specified alignment.
- const DataLayout &DL = RMWI->getModule()->getDataLayout();
- return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
-}
-
-static unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
- // TODO(PR27168): same comment as above.
- const DataLayout &DL = CASI->getModule()->getDataLayout();
- return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
-}
-
// Determine if a particular atomic operation has a supported size,
// and is of appropriate alignment, to be passed through for target
// lowering. (Versus turning into a __atomic libcall)
template <typename Inst>
static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
- return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
+ Align Alignment = I->getAlign();
+ return Alignment >= Size &&
+ Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
}
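Editorial note, not part of the patch: with this change the predicate compares an llvm::Align directly against the byte size. A minimal standalone sketch of the same check, assuming a hypothetical 8-byte inline-atomic limit:

#include "llvm/Support/Alignment.h"

// Mirrors atomicSizeSupported: the access must be at least naturally
// aligned and no wider than what the target can lower inline (the
// 8-byte limit here is an assumption for illustration).
static bool sizeSupported(unsigned SizeInBytes, llvm::Align Alignment,
                          unsigned MaxAtomicBytes = 8) {
  return Alignment >= SizeInBytes && SizeInBytes <= MaxAtomicBytes;
}

// sizeSupported(4, llvm::Align(4)) -> true  (lowered inline)
// sizeSupported(4, llvm::Align(2)) -> false (expanded to a __atomic libcall)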
bool AtomicExpand::runOnFunction(Function &F) {
@@ -383,7 +351,7 @@ LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
Value *NewAddr = Builder.CreateBitCast(Addr, PT);
auto *NewLI = Builder.CreateLoad(NewTy, NewAddr);
- NewLI->setAlignment(MaybeAlign(LI->getAlignment()));
+ NewLI->setAlignment(LI->getAlign());
NewLI->setVolatile(LI->isVolatile());
NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
@@ -470,7 +438,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
Value *NewAddr = Builder.CreateBitCast(Addr, PT);
StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
- NewSI->setAlignment(MaybeAlign(SI->getAlignment()));
+ NewSI->setAlignment(SI->getAlign());
NewSI->setVolatile(SI->isVolatile());
NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
@@ -570,8 +538,8 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
unsigned ValueSize = getAtomicOpSize(AI);
if (ValueSize < MinCASSize) {
- llvm_unreachable(
- "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
+ expandPartwordAtomicRMW(AI,
+ TargetLoweringBase::AtomicExpansionKind::LLSC);
} else {
auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
return performAtomicOp(AI->getOperation(), Builder, Loaded,
@@ -608,16 +576,43 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
namespace {
-/// Result values from createMaskInstrs helper.
struct PartwordMaskValues {
- Type *WordType;
- Type *ValueType;
- Value *AlignedAddr;
- Value *ShiftAmt;
- Value *Mask;
- Value *Inv_Mask;
+ // These three fields are guaranteed to be set by createMaskInstrs.
+ Type *WordType = nullptr;
+ Type *ValueType = nullptr;
+ Value *AlignedAddr = nullptr;
+ // The remaining fields can be null.
+ Value *ShiftAmt = nullptr;
+ Value *Mask = nullptr;
+ Value *Inv_Mask = nullptr;
};
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
+ auto PrintObj = [&O](auto *V) {
+ if (V)
+ O << *V;
+ else
+ O << "nullptr";
+ O << '\n';
+ };
+ O << "PartwordMaskValues {\n";
+ O << " WordType: ";
+ PrintObj(PMV.WordType);
+ O << " ValueType: ";
+ PrintObj(PMV.ValueType);
+ O << " AlignedAddr: ";
+ PrintObj(PMV.AlignedAddr);
+ O << " ShiftAmt: ";
+ PrintObj(PMV.ShiftAmt);
+ O << " Mask: ";
+ PrintObj(PMV.Mask);
+ O << " Inv_Mask: ";
+ PrintObj(PMV.Inv_Mask);
+ O << "}\n";
+ return O;
+}
+
} // end anonymous namespace
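Editorial aside: the operator<< above is only a debugging aid (hence LLVM_ATTRIBUTE_UNUSED). A hypothetical use from inside this pass, assuming the file's existing "atomic-expand" DEBUG_TYPE and with Builder, I, ValueTy, and Addr standing in for the instruction being expanded, would be:

// Dump the computed masking values for an instruction being expanded.
PartwordMaskValues PMV =
    createMaskInstrs(Builder, I, ValueTy, Addr,
                     TLI->getMinCmpXchgSizeInBits() / 8);
LLVM_DEBUG(dbgs() << "Partword expansion of " << *I << ":\n" << PMV);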
/// This is a helper function which builds instructions to provide
@@ -638,48 +633,74 @@ struct PartwordMaskValues {
/// Inv_Mask: The inverse of Mask.
static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
Type *ValueType, Value *Addr,
- unsigned WordSize) {
- PartwordMaskValues Ret;
+ unsigned MinWordSize) {
+ PartwordMaskValues PMV;
- BasicBlock *BB = I->getParent();
- Function *F = BB->getParent();
Module *M = I->getModule();
-
- LLVMContext &Ctx = F->getContext();
+ LLVMContext &Ctx = M->getContext();
const DataLayout &DL = M->getDataLayout();
-
unsigned ValueSize = DL.getTypeStoreSize(ValueType);
- assert(ValueSize < WordSize);
+ PMV.ValueType = ValueType;
+ PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(Ctx, MinWordSize * 8)
+ : ValueType;
+ if (PMV.ValueType == PMV.WordType) {
+ PMV.AlignedAddr = Addr;
+ return PMV;
+ }
- Ret.ValueType = ValueType;
- Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);
+ assert(ValueSize < MinWordSize);
Type *WordPtrType =
- Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
+ PMV.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
- Ret.AlignedAddr = Builder.CreateIntToPtr(
- Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
+ PMV.AlignedAddr = Builder.CreateIntToPtr(
+ Builder.CreateAnd(AddrInt, ~(uint64_t)(MinWordSize - 1)), WordPtrType,
"AlignedAddr");
- Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
+ Value *PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB");
if (DL.isLittleEndian()) {
// turn bytes into bits
- Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
+ PMV.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
} else {
// turn bytes into bits, and count from the other side.
- Ret.ShiftAmt =
- Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
+ PMV.ShiftAmt = Builder.CreateShl(
+ Builder.CreateXor(PtrLSB, MinWordSize - ValueSize), 3);
}
- Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
- Ret.Mask = Builder.CreateShl(
- ConstantInt::get(Ret.WordType, (1 << (ValueSize * 8)) - 1), Ret.ShiftAmt,
+ PMV.ShiftAmt = Builder.CreateTrunc(PMV.ShiftAmt, PMV.WordType, "ShiftAmt");
+ PMV.Mask = Builder.CreateShl(
+ ConstantInt::get(PMV.WordType, (1 << (ValueSize * 8)) - 1), PMV.ShiftAmt,
"Mask");
- Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");
+ PMV.Inv_Mask = Builder.CreateNot(PMV.Mask, "Inv_Mask");
+ return PMV;
+}
+
+static Value *extractMaskedValue(IRBuilder<> &Builder, Value *WideWord,
+ const PartwordMaskValues &PMV) {
+ assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
+ if (PMV.WordType == PMV.ValueType)
+ return WideWord;
+
+ Value *Shift = Builder.CreateLShr(WideWord, PMV.ShiftAmt, "shifted");
+ Value *Trunc = Builder.CreateTrunc(Shift, PMV.ValueType, "extracted");
+ return Trunc;
+}
- return Ret;
+static Value *insertMaskedValue(IRBuilder<> &Builder, Value *WideWord,
+ Value *Updated, const PartwordMaskValues &PMV) {
+ assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
+ assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
+ if (PMV.WordType == PMV.ValueType)
+ return Updated;
+
+ Value *ZExt = Builder.CreateZExt(Updated, PMV.WordType, "extended");
+ Value *Shift =
+ Builder.CreateShl(ZExt, PMV.ShiftAmt, "shifted", /*HasNUW*/ true);
+ Value *And = Builder.CreateAnd(WideWord, PMV.Inv_Mask, "unmasked");
+ Value *Or = Builder.CreateOr(And, Shift, "inserted");
+ return Or;
}
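For intuition, an editorial sketch that is not part of the patch: the aligned-address, shift, and mask arithmetic from createMaskInstrs, plus the extractMaskedValue/insertMaskedValue round trip, written over plain integers for a 1-byte value inside a 4-byte word on a little-endian target (all concrete numbers are invented for illustration):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Addr = 0x1003;    // address of the i8 operand
  const unsigned MinWordSize = 4;  // bytes per cmpxchg/LL-SC word
  const unsigned ValueSize = 1;    // bytes in the original operand

  uint64_t AlignedAddr = Addr & ~(uint64_t)(MinWordSize - 1);    // 0x1000
  uint64_t PtrLSB = Addr & (MinWordSize - 1);                    // 3
  // Big-endian targets first XOR PtrLSB with (MinWordSize - ValueSize).
  unsigned ShiftAmt = unsigned(PtrLSB) * 8;                      // 24
  uint32_t Mask = ((1u << (ValueSize * 8)) - 1) << ShiftAmt;     // 0xFF000000
  uint32_t InvMask = ~Mask;                                      // 0x00FFFFFF

  // Extract the byte from a loaded wide word, then insert a new byte.
  uint32_t Wide = 0xAABBCCDD;
  uint8_t Extracted = uint8_t(Wide >> ShiftAmt);                 // 0xAA
  uint32_t Inserted = (Wide & InvMask) | (uint32_t{0x42} << ShiftAmt);
  assert(Extracted == 0xAA && Inserted == 0x42BBCCDD);
  (void)AlignedAddr;
  return 0;
}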
/// Emit IR to implement a masked version of a given atomicrmw
@@ -719,13 +740,9 @@ static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
// Finally, comparison ops will operate on the full value, so
// truncate down to the original size, and expand out again after
// doing the operation.
- Value *Loaded_Shiftdown = Builder.CreateTrunc(
- Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
- Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
- Value *NewVal_Shiftup = Builder.CreateShl(
- Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
- Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
- Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
+ Value *Loaded_Extract = extractMaskedValue(Builder, Loaded, PMV);
+ Value *NewVal = performAtomicOp(Op, Builder, Loaded_Extract, Inc);
+ Value *FinalVal = insertMaskedValue(Builder, Loaded, NewVal, PMV);
return FinalVal;
}
default:
@@ -738,12 +755,10 @@ static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
///
/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
/// way as a typical atomicrmw expansion. The only difference here is
-/// that the operation inside of the loop must operate only upon a
+/// that the operation inside of the loop may operate upon only a
/// part of the value.
void AtomicExpand::expandPartwordAtomicRMW(
AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
- assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);
-
AtomicOrdering MemOpOrder = AI->getOrdering();
IRBuilder<> Builder(AI);
@@ -761,13 +776,18 @@ void AtomicExpand::expandPartwordAtomicRMW(
ValOperand_Shifted, AI->getValOperand(), PMV);
};
- // TODO: When we're ready to support LLSC conversions too, use
- // insertRMWLLSCLoop here for ExpansionKind==LLSC.
- Value *OldResult =
- insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
- PerformPartwordOp, createCmpXchgInstFun);
- Value *FinalOldResult = Builder.CreateTrunc(
- Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
+ Value *OldResult;
+ if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
+ OldResult =
+ insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
+ PerformPartwordOp, createCmpXchgInstFun);
+ } else {
+ assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
+ OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
+ MemOpOrder, PerformPartwordOp);
+ }
+
+ Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
AI->replaceAllUsesWith(FinalOldResult);
AI->eraseFromParent();
}
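Editorial context for the new LLSC branch, hedged: a target can now legally request LL/SC expansion for atomicrmw operations narrower than its cmpxchg width. A hypothetical TargetLowering override (MyTargetLowering is invented; the hook itself exists in TargetLoweringBase) might look like:

// Ask AtomicExpandPass to build an LL/SC loop for every atomicrmw,
// including i8/i16 operations; before this patch the sub-word case
// hit the llvm_unreachable removed from tryExpandAtomicRMW above.
TargetLowering::AtomicExpansionKind
MyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  return AtomicExpansionKind::LLSC;
}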
@@ -800,14 +820,13 @@ AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(Op, PMV.AlignedAddr,
NewOperand, AI->getOrdering());
- Value *FinalOldResult = Builder.CreateTrunc(
- Builder.CreateLShr(NewAI, PMV.ShiftAmt), PMV.ValueType);
+ Value *FinalOldResult = extractMaskedValue(Builder, NewAI, PMV);
AI->replaceAllUsesWith(FinalOldResult);
AI->eraseFromParent();
return NewAI;
}
-void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
+bool AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
// The basic idea here is that we're expanding a cmpxchg of a
// smaller memory size up to a word-sized cmpxchg. To do this, we
// need to add a retry-loop for strong cmpxchg, so that
@@ -923,14 +942,14 @@ void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
// partword.cmpxchg.end:
Builder.SetInsertPoint(CI);
- Value *FinalOldVal = Builder.CreateTrunc(
- Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
+ Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
Value *Res = UndefValue::get(CI->getType());
Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
Res = Builder.CreateInsertValue(Res, Success, 1);
CI->replaceAllUsesWith(Res);
CI->eraseFromParent();
+ return true;
}
void AtomicExpand::expandAtomicOpToLLSC(
@@ -965,8 +984,7 @@ void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(
Builder, AI, PMV.AlignedAddr, ValOperand_Shifted, PMV.Mask, PMV.ShiftAmt,
AI->getOrdering());
- Value *FinalOldResult = Builder.CreateTrunc(
- Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
+ Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
AI->replaceAllUsesWith(FinalOldResult);
AI->eraseFromParent();
}
@@ -987,9 +1005,7 @@ void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) {
Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask,
CI->getSuccessOrdering());
- Value *FinalOldVal = Builder.CreateTrunc(
- Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
-
+ Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
Value *Res = UndefValue::get(CI->getType());
Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
Value *Success = Builder.CreateICmpEQ(
@@ -1126,24 +1142,28 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
//
// The full expansion we produce is:
// [...]
+ // %aligned.addr = ...
// cmpxchg.start:
- // %unreleasedload = @load.linked(%addr)
- // %should_store = icmp eq %unreleasedload, %desired
- // br i1 %should_store, label %cmpxchg.fencedstore,
+ // %unreleasedload = @load.linked(%aligned.addr)
+ // %unreleasedload.extract = extract value from %unreleasedload
+ // %should_store = icmp eq %unreleasedload.extract, %desired
+ // br i1 %should_store, label %cmpxchg.releasingstore,
// label %cmpxchg.nostore
// cmpxchg.releasingstore:
// fence?
// br label cmpxchg.trystore
// cmpxchg.trystore:
- // %loaded.trystore = phi [%unreleasedload, %releasingstore],
+ // %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore],
// [%releasedload, %cmpxchg.releasedload]
- // %stored = @store_conditional(%new, %addr)
+ // %updated.new = insert %new into %loaded.trystore
+ // %stored = @store_conditional(%updated.new, %aligned.addr)
// %success = icmp eq i32 %stored, 0
// br i1 %success, label %cmpxchg.success,
// label %cmpxchg.releasedload/%cmpxchg.failure
// cmpxchg.releasedload:
- // %releasedload = @load.linked(%addr)
- // %should_store = icmp eq %releasedload, %desired
+ // %releasedload = @load.linked(%aligned.addr)
+ // %releasedload.extract = extract value from %releasedload
+ // %should_store = icmp eq %releasedload.extract, %desired
// br i1 %should_store, label %cmpxchg.trystore,
// label %cmpxchg.failure
// cmpxchg.success:
@@ -1159,9 +1179,10 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// fence?
// br label %cmpxchg.end
// cmpxchg.end:
- // %loaded = phi [%loaded.nostore, %cmpxchg.failure],
- // [%loaded.trystore, %cmpxchg.trystore]
+ // %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure],
+ // [%loaded.trystore, %cmpxchg.trystore]
// %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
+ // %loaded = extract value from %loaded.exit
// %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
// %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
// [...]
@@ -1187,13 +1208,20 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(BB);
if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
TLI->emitLeadingFence(Builder, CI, SuccessOrder);
+
+ PartwordMaskValues PMV =
+ createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr,
+ TLI->getMinCmpXchgSizeInBits() / 8);
Builder.CreateBr(StartBB);
// Start the main loop block now that we've taken care of the preliminaries.
Builder.SetInsertPoint(StartBB);
- Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+ Value *UnreleasedLoad =
+ TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder);
+ Value *UnreleasedLoadExtract =
+ extractMaskedValue(Builder, UnreleasedLoad, PMV);
Value *ShouldStore = Builder.CreateICmpEQ(
- UnreleasedLoad, CI->getCompareOperand(), "should_store");
+ UnreleasedLoadExtract, CI->getCompareOperand(), "should_store");
// If the cmpxchg doesn't actually need any ordering when it fails, we can
// jump straight past that fence instruction (if it exists).
@@ -1205,8 +1233,13 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.CreateBr(TryStoreBB);
Builder.SetInsertPoint(TryStoreBB);
- Value *StoreSuccess = TLI->emitStoreConditional(
- Builder, CI->getNewValOperand(), Addr, MemOpOrder);
+ PHINode *LoadedTryStore =
+ Builder.CreatePHI(PMV.WordType, 2, "loaded.trystore");
+ LoadedTryStore->addIncoming(UnreleasedLoad, ReleasingStoreBB);
+ Value *NewValueInsert =
+ insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV);
+ Value *StoreSuccess =
+ TLI->emitStoreConditional(Builder, NewValueInsert, Addr, MemOpOrder);
StoreSuccess = Builder.CreateICmpEQ(
StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
@@ -1216,13 +1249,16 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(ReleasedLoadBB);
Value *SecondLoad;
if (HasReleasedLoadBB) {
- SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
- ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
- "should_store");
+ SecondLoad = TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder);
+ Value *SecondLoadExtract = extractMaskedValue(Builder, SecondLoad, PMV);
+ ShouldStore = Builder.CreateICmpEQ(SecondLoadExtract,
+ CI->getCompareOperand(), "should_store");
// If the cmpxchg doesn't actually need any ordering when it fails, we can
// jump straight past that fence instruction (if it exists).
Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
+ // Update PHI node in TryStoreBB.
+ LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB);
} else
Builder.CreateUnreachable();
@@ -1234,6 +1270,12 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.CreateBr(ExitBB);
Builder.SetInsertPoint(NoStoreBB);
+ PHINode *LoadedNoStore =
+ Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.nostore");
+ LoadedNoStore->addIncoming(UnreleasedLoad, StartBB);
+ if (HasReleasedLoadBB)
+ LoadedNoStore->addIncoming(SecondLoad, ReleasedLoadBB);
+
// In the failing case, where we don't execute the store-conditional, the
// target might want to balance out the load-linked with a dedicated
// instruction (e.g., on ARM, clearing the exclusive monitor).
@@ -1241,6 +1283,11 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.CreateBr(FailureBB);
Builder.SetInsertPoint(FailureBB);
+ PHINode *LoadedFailure =
+ Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.failure");
+ LoadedFailure->addIncoming(LoadedNoStore, NoStoreBB);
+ if (CI->isWeak())
+ LoadedFailure->addIncoming(LoadedTryStore, TryStoreBB);
if (ShouldInsertFencesForAtomic)
TLI->emitTrailingFence(Builder, CI, FailureOrder);
Builder.CreateBr(ExitBB);
@@ -1250,32 +1297,20 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
// PHI.
Builder.SetInsertPoint(ExitBB, ExitBB->begin());
- PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
+ PHINode *LoadedExit =
+ Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.exit");
+ LoadedExit->addIncoming(LoadedTryStore, SuccessBB);
+ LoadedExit->addIncoming(LoadedFailure, FailureBB);
+ PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2, "success");
Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
- // Setup the builder so we can create any PHIs we need.
- Value *Loaded;
- if (!HasReleasedLoadBB)
- Loaded = UnreleasedLoad;
- else {
- Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
- PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
- TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
- TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
-
- Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
- PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
- NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
- NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
-
- Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
- PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
- ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
- ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);
-
- Loaded = ExitLoaded;
- }
+ // This is the "exit value" from the cmpxchg expansion. It may be of
+ // a type wider than the one in the cmpxchg instruction.
+ Value *LoadedFull = LoadedExit;
+
+ Builder.SetInsertPoint(ExitBB, std::next(Success->getIterator()));
+ Value *Loaded = extractMaskedValue(Builder, LoadedFull, PMV);
// Look for any users of the cmpxchg that are just comparing the loaded value
// against the desired one, and replace them with the CFG-derived version.
@@ -1377,7 +1412,7 @@ Value *AtomicExpand::insertRMWCmpXchgLoop(
Builder.SetInsertPoint(BB);
LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
// Atomics require at least natural alignment.
- InitLoaded->setAlignment(MaybeAlign(ResultTy->getPrimitiveSizeInBits() / 8));
+ InitLoaded->setAlignment(Align(ResultTy->getPrimitiveSizeInBits() / 8));
Builder.CreateBr(LoopBB);
// Start the main loop block now that we've taken care of the preliminaries.
@@ -1414,11 +1449,9 @@ bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
case TargetLoweringBase::AtomicExpansionKind::None:
if (ValueSize < MinCASSize)
- expandPartwordCmpXchg(CI);
+ return expandPartwordCmpXchg(CI);
return false;
case TargetLoweringBase::AtomicExpansionKind::LLSC: {
- assert(ValueSize >= MinCASSize &&
- "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
return expandAtomicCmpXchg(CI);
}
case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
@@ -1449,7 +1482,7 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
// must be one of the potentially-specialized sizes, and the value
// type must actually exist in C on the target (otherwise, the
// function wouldn't actually be defined.)
-static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
+static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
const DataLayout &DL) {
// TODO: "LargestSize" is an approximation for "largest type that
// you can express in C". It seems to be the case that int128 is
@@ -1459,7 +1492,7 @@ static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
// really be some more reliable way in LLVM of determining integer
// sizes which are valid in the target's C ABI...
unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
- return Align >= Size &&
+ return Alignment >= Size &&
(Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
Size <= LargestSize;
}
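A worked reading of the predicate above (editorial, with the 64-bit assumption made explicit): when the largest legal integer type is at least 64 bits, LargestSize is 16, so

// canUseSizedAtomicCall(8, Align(8), DL)   -> true  (use __atomic_*_8)
// canUseSizedAtomicCall(8, Align(4), DL)   -> false (under-aligned)
// canUseSizedAtomicCall(3, Align(4), DL)   -> false (no 3-byte variant)
// canUseSizedAtomicCall(16, Align(16), DL) -> true here, false when the
//                                             largest legal integer is 32 bits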
@@ -1469,10 +1502,9 @@ void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
bool expanded = expandAtomicOpToLibcall(
- I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
+ I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr,
I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
(void)expanded;
assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
@@ -1483,11 +1515,10 @@ void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
bool expanded = expandAtomicOpToLibcall(
- I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
- I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(),
+ nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
(void)expanded;
assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
}
@@ -1498,10 +1529,9 @@ void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
bool expanded = expandAtomicOpToLibcall(
- I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
+ I, Size, I->getAlign(), I->getPointerOperand(), I->getNewValOperand(),
I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
Libcalls);
(void)expanded;
@@ -1571,13 +1601,12 @@ void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
bool Success = false;
if (!Libcalls.empty())
Success = expandAtomicOpToLibcall(
- I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
- I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ I, Size, I->getAlign(), I->getPointerOperand(), I->getValOperand(),
+ nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
// The expansion failed: either there were no libcalls at all for
// the operation (min/max), or there were only size-specialized
@@ -1608,7 +1637,7 @@ void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
bool AtomicExpand::expandAtomicOpToLibcall(
- Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
+ Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand,
Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
assert(Libcalls.size() == 6);
@@ -1619,10 +1648,10 @@ bool AtomicExpand::expandAtomicOpToLibcall(
IRBuilder<> Builder(I);
IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
- bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
+ bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
- unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);
+ const Align AllocaAlignment = DL.getPrefTypeAlign(SizedIntTy);
// TODO: the "order" argument type is "int", not int32. So
// getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
@@ -1712,7 +1741,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
// 'expected' argument, if present.
if (CASExpected) {
AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
- AllocaCASExpected->setAlignment(MaybeAlign(AllocaAlignment));
+ AllocaCASExpected->setAlignment(AllocaAlignment);
unsigned AllocaAS = AllocaCASExpected->getType()->getPointerAddressSpace();
AllocaCASExpected_i8 =
@@ -1731,7 +1760,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
Args.push_back(IntValue);
} else {
AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
- AllocaValue->setAlignment(MaybeAlign(AllocaAlignment));
+ AllocaValue->setAlignment(AllocaAlignment);
AllocaValue_i8 =
Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
@@ -1743,7 +1772,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
// 'ret' argument.
if (!CASExpected && HasResult && !UseSizedLibcall) {
AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
- AllocaResult->setAlignment(MaybeAlign(AllocaAlignment));
+ AllocaResult->setAlignment(AllocaAlignment);
unsigned AllocaAS = AllocaResult->getType()->getPointerAddressSpace();
AllocaResult_i8 =
Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS));
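For reference, hedged and not taken from this patch: the temporaries above feed the generic, non-size-specialized __atomic calls, which pass operands and results through memory. The CAS entry point, as documented for the GCC/compiler-rt libatomic runtime (quoted from memory, so treat the exact signature as an assumption), is roughly:

#include <cstddef>

// Generic CAS libcall used when canUseSizedAtomicCall returns false;
// "expected" is read as the comparison value and updated in place with
// the observed value when the exchange fails.
extern "C" bool __atomic_compare_exchange(std::size_t size, void *object,
                                          void *expected, void *desired,
                                          int success_order,
                                          int failure_order);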