Diffstat (limited to 'contrib/llvm/lib/Target')
129 files changed, 10102 insertions, 976 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 3ef3c8b840cb..f398117de953 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2487,15 +2487,36 @@ static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, return true; } -/// Return true when there is potentially a faster code sequence -/// for an instruction chain ending in \p Root. All potential patterns are -/// listed -/// in the \p Pattern vector. Pattern should be sorted in priority order since -/// the pattern evaluator stops checking as soon as it finds a faster sequence. +// TODO: There are many more machine instruction opcodes to match: +// 1. Other data types (integer, vectors) +// 2. Other math / logic operations (xor, or) +// 3. Other forms of the same operation (intrinsics and other variants) +bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case AArch64::FADDDrr: + case AArch64::FADDSrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FMULDrr: + case AArch64::FMULSrr: + case AArch64::FMULX32: + case AArch64::FMULX64: + case AArch64::FMULXv2f32: + case AArch64::FMULXv2f64: + case AArch64::FMULXv4f32: + case AArch64::FMULv2f32: + case AArch64::FMULv2f64: + case AArch64::FMULv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + return false; + } +} -bool AArch64InstrInfo::getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &Patterns) const { +/// Find instructions that can be turned into madd. +static bool getMaddPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) { unsigned Opc = Root.getOpcode(); MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -2600,6 +2621,20 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( return Found; } +/// Return true when there is potentially a faster code sequence for an +/// instruction chain ending in \p Root. All potential patterns are listed in +/// the \p Pattern vector. Pattern should be sorted in priority order since the +/// pattern evaluator stops checking as soon as it finds a faster sequence. + +bool AArch64InstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + if (getMaddPatterns(Root, Patterns)) + return true; + + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + /// genMadd - Generate madd instruction and combine mul and add. /// Example: /// MUL I=A,B,0 @@ -2713,8 +2748,10 @@ void AArch64InstrInfo::genAlternativeCodeSequence( unsigned Opc; switch (Pattern) { default: - // signal error. - break; + // Reassociate instructions. 
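// Illustrative sketch (hypothetical functions, not from this patch): what the
// isAssociativeAndCommutative hook above lets the machine combiner do. For the
// FP opcodes it lists, reassociation is only legal under UnsafeFPMath, which
// is exactly the value the switch returns.
float fadd_chain(float a, float b, float c, float d) {
  return ((a + b) + c) + d;        // three dependent fadds: critical path = 3
}
float fadd_reassoc(float a, float b, float c, float d) {
  return (a + b) + (c + d);        // (a+b) and (c+d) are independent: path = 2
}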
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, + DelInstrs, InstrIdxForVirtReg); + return; case MachineCombinerPattern::MULADDW_OP1: case MachineCombinerPattern::MULADDX_OP1: // MUL I=A,B,0 diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index ae02822a32e6..b5bb446f8c16 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -169,7 +169,9 @@ public: bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const override; - + /// Return true when Inst is associative and commutative so that it can be + /// reassociated. + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; /// When getMachineCombinerPatterns() finds patterns, this function generates /// the instructions that could replace the original code sequence void genAlternativeCodeSequence( diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h index 8c3cb567fc7e..5d00e1cb3be5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -20,8 +20,10 @@ class AMDGPUInstrPrinter; class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class MachineSchedContext; class MCAsmInfo; class raw_ostream; +class ScheduleDAGInstrs; class Target; class TargetMachine; @@ -49,6 +51,8 @@ FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); +ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); + ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 9c3790264377..1239dfb235ef 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -91,6 +91,25 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)) {} +void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; + + // Need to construct an MCSubtargetInfo here in case we have no functions + // in the module. 
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( + TM.getTargetTriple().str(), TM.getTargetCPU(), + TM.getTargetFeatureString())); + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + + TS->EmitDirectiveHSACodeObjectVersion(1, 0); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); + TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, + "AMD", "AMDGPU"); +} + void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; @@ -148,11 +167,15 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); } + MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV)); const DataLayout &DL = getDataLayout(); + + // Emit the size + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext)); OutStreamer->PushSection(); OutStreamer->SwitchSection( getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); - MCSymbol *GVSym = getSymbol(GV); const Constant *C = GV->getInitializer(); OutStreamer->EmitLabel(GVSym); EmitGlobalConstant(DL, C); @@ -178,13 +201,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (!STM.isAmdHsaOS()) { EmitProgramInfoSI(MF, KernelInfo); } - // Emit directives - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->EmitDirectiveHSACodeObjectVersion(1, 0); - AMDGPU::IsaVersion ISA = STM.getIsaVersion(); - TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, - "AMD", "AMDGPU"); } else { EmitProgramInfoR600(MF); } @@ -417,16 +433,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } } - if (VCCUsed || FlatUsed || STM.isXNACKEnabled()) { - MaxSGPR += 2; + unsigned ExtraSGPRs = 0; - if (FlatUsed) - MaxSGPR += 2; + if (VCCUsed) + ExtraSGPRs = 2; + if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (FlatUsed) + ExtraSGPRs = 4; + } else { if (STM.isXNACKEnabled()) - MaxSGPR += 2; + ExtraSGPRs = 4; + + if (FlatUsed) + ExtraSGPRs = 6; } + MaxSGPR += ExtraSGPRs; + // We found the maximum register index. They start at 0, so add one to get the // number of registers. 
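// Illustrative sketch (hypothetical helper, not from this patch): the SGPR
// reservation rule encoded by the hunk above, restated on its own. The
// register names in the trailing comments are the editor's reading of the
// 2/4/6 constants.
static unsigned computeExtraSGPRs(bool VCCUsed, bool FlatUsed,
                                  bool XNACKEnabled, bool IsVIOrLater) {
  unsigned ExtraSGPRs = 0;
  if (VCCUsed)
    ExtraSGPRs = 2;                // vcc pair
  if (!IsVIOrLater) {
    if (FlatUsed)
      ExtraSGPRs = 4;              // vcc + flat_scratch
  } else {
    if (XNACKEnabled)
      ExtraSGPRs = 4;              // vcc + xnack_mask
    if (FlatUsed)
      ExtraSGPRs = 6;              // vcc + xnack_mask + flat_scratch
  }
  return ExtraSGPRs;               // added to MaxSGPR before the final +1
}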
ProgInfo.NumVGPR = MaxVGPR + 1; @@ -563,7 +587,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); + OutStreamer->EmitIntValue(MFI->PSInputEna, 4); + OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); + OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 817cbfc0c0eb..99d4091670fe 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -103,6 +103,8 @@ public: void EmitGlobalVariable(const GlobalVariable *GV) override; + void EmitStartOfAsmFile(Module &M) override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 6ffa7a083583..b0db26124a0c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -20,28 +20,83 @@ def CC_SI : CallingConv<[ CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, - SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21 + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 ]>>>, CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] >>>, + // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. 
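// Spelled out: 32*4 + 4 = 132, i.e. presumably 32 vec4 inputs plus 4 extra
// registers, so the VGPR list that follows is extended through VGPR135
// (136 registers) to cover that minimum.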
CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, - VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 ]>>>, CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] >>> ]>; +def RetCC_SI : CallingConv<[ + CCIfType<[i32] , CCAssignToReg<[ + SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, + SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + ]>>, + + // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
+ CCIfType<[f32] , CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 + ]>> +]>; + // Calling convention for R600 def CC_R600 : CallingConv<[ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 222f63161be5..1a59a460ee7d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -282,12 +282,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::i32, Legal); setOperationAction(ISD::UMAX, MVT::i32, Legal); - if (!Subtarget->hasFFBH()) + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + else setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); if (!Subtarget->hasFFBL()) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + + setOperationAction(ISD::CTLZ, MVT::i64, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -565,6 +572,12 @@ void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, State.AnalyzeFormalArguments(Ins, CC_AMDGPU); } +void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, + const SmallVectorImpl<ISD::OutputArg> &Outs) const { + + State.AnalyzeReturn(Outs, RetCC_SI); +} + SDValue AMDGPUTargetLowering::LowerReturn( SDValue Chain, CallingConv::ID CallConv, @@ -633,6 +646,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTLZ: + case ISD::CTLZ_ZERO_UNDEF: + return LowerCTLZ(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; @@ -2159,6 +2175,145 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + if (ZeroUndef && Src.getValueType() == MVT::i32) + return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + + 
SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::i32); + + SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); + + SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); + SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); + + const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); + SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); + + // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) + SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); + + if (!ZeroUndef) { + // Test if the full 64-bit input is zero. + + // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, + // which we probably don't want. + SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + + // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction + // with the same cycles, otherwise it is slower. + // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src, + // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ); + + const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32); + + // The instruction returns -1 for 0 input, but the defined intrinsic + // behavior is to return the number of bits. + NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewCtlz); + } + + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); +} + +SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + // Unsigned + // cul2f(ulong u) + //{ + // uint lz = clz(u); + // uint e = (u != 0) ? 127U + 63U - lz : 0; + // u = (u << lz) & 0x7fffffffffffffffUL; + // ulong t = u & 0xffffffffffUL; + // uint v = (e << 23) | (uint)(u >> 40); + // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U); + // return as_float(v + r); + //} + // Signed + // cl2f(long l) + //{ + // long s = l >> 63; + // float r = cul2f((l + s) ^ s); + // return s ? 
-r : r; + //} + + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + SDValue L = Src; + + SDValue S; + if (Signed) { + const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64); + S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit); + + SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S); + L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S); + } + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::f32); + + + SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); + SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64); + SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L); + LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ); + + SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32); + SDValue E = DAG.getSelect(SL, MVT::i32, + DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE), + DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ), + ZeroI32); + + SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64, + DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ), + DAG.getConstant((-1ULL) >> 1, SL, MVT::i64)); + + SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U, + DAG.getConstant(0xffffffffffULL, SL, MVT::i64)); + + SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64, + U, DAG.getConstant(40, SL, MVT::i64)); + + SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32, + DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)), + DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl)); + + SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64); + SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT); + SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ); + + SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One); + + SDValue R = DAG.getSelect(SL, MVT::i32, + RCmp, + One, + DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32)); + R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R); + R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R); + + if (!Signed) + return R; + + SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R); + return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R); +} + SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const { SDLoc SL(Op); @@ -2184,35 +2339,29 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue S0 = Op.getOperand(0); - if (S0.getValueType() != MVT::i64) - return SDValue(); + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); EVT DestVT = Op.getValueType(); if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, false); - assert(DestVT == MVT::f32); - - SDLoc DL(Op); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, false); - // f32 uint_to_fp i64 - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(1, DL, MVT::i32)); - SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); - // TODO: Should this propagate fast-math-flags? 
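// Illustrative sketch (not from this patch): a scalar C++ model of the cul2f()
// / cl2f() routines quoted in the LowerINT_TO_FP32 comment above, useful for
// checking the round-to-nearest-even logic outside the DAG. It assumes the
// GCC/Clang __builtin_clzll intrinsic; everything else follows the pseudocode.
#include <cstdint>
#include <cstring>

static float cul2f(uint64_t u) {
  if (u == 0)
    return 0.0f;                                      // the "e = 0" case
  unsigned lz = __builtin_clzll(u);
  uint32_t e = 127u + 63u - lz;                       // biased exponent
  uint64_t m = (u << lz) & 0x7fffffffffffffffULL;     // normalize, drop leading 1
  uint64_t t = m & 0xffffffffffULL;                   // bits below the 23-bit mantissa
  uint32_t v = (e << 23) | static_cast<uint32_t>(m >> 40);
  uint32_t r = t > 0x8000000000ULL
                   ? 1u
                   : (t == 0x8000000000ULL ? (v & 1u) : 0u); // ties to even
  v += r;
  float f;
  std::memcpy(&f, &v, sizeof f);                      // as_float(v + r)
  return f;
}

static float cl2f(int64_t l) {                        // signed wrapper
  uint64_t s = l < 0 ? ~0ULL : 0ULL;
  float r = cul2f((static_cast<uint64_t>(l) + s) ^ s);
  return l < 0 ? -r : r;
}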
- FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, - DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 - return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); + return SDValue(); } SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, true); + + if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, true); return SDValue(); @@ -2447,6 +2596,97 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, return DAG.getSExtOrTrunc(Mul, DL, VT); } +static bool isNegativeOne(SDValue Val) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) + return C->isAllOnesValue(); + return false; +} + +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +// Get FFBH node if the incoming op may have been type legalized from a smaller +// type VT. +// Need to match pre-legalized type because the generic legalization inserts the +// add/sub between the select and compare. +static SDValue getFFBH_U32(const TargetLowering &TLI, + SelectionDAG &DAG, SDLoc SL, SDValue Op) { + EVT VT = Op.getValueType(); + EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + if (LegalVT != MVT::i32) + return SDValue(); + + if (VT != MVT::i32) + Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op); + + SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op); + if (VT != MVT::i32) + FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH); + + return FFBH; +} + +// The native instructions return -1 on 0 input. Optimize out a select that +// produces -1 on 0. +// +// TODO: If zero is not undef, we could also do this if the output is compared +// against the bitwidth. +// +// TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 
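// Illustrative sketch (not from this patch): a scalar model of the select
// combine implemented in performCtlzCombine below. The native ffbh_u32 /
// s_flbit_i32_b32 already returns -1 (all ones) for a zero input, so the
// compare-against-zero select is redundant.
#include <cstdint>

// select (setcc x, 0, eq), -1, (ctlz_zero_undef x)   -- form before the combine
static uint32_t select_form(uint32_t x) {
  return x == 0 ? 0xffffffffu : static_cast<uint32_t>(__builtin_clz(x));
}
// After the combine the whole expression is a single ffbh_u32(x), whose
// defined result for x == 0 is exactly the -1 the select was producing.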
+SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL, + SDValue Cond, + SDValue LHS, + SDValue RHS, + DAGCombinerInfo &DCI) const { + ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (!CmpRhs || !CmpRhs->isNullValue()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + SDValue CmpLHS = Cond.getOperand(0); + + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + if (CCOpcode == ISD::SETEQ && + isCtlzOpc(RHS.getOpcode()) && + RHS.getOperand(0) == CmpLHS && + isNegativeOne(LHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + if (CCOpcode == ISD::SETNE && + isCtlzOpc(LHS.getOpcode()) && + LHS.getOperand(0) == CmpLHS && + isNegativeOne(RHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32 && Cond.hasOneUse()) + return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + + // There's no reason to not do this if the condition has other uses. + return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2471,23 +2711,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, simplifyI24(N1, DCI); return SDValue(); } - case ISD::SELECT: { - SDValue Cond = N->getOperand(0); - if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { - EVT VT = N->getValueType(0); - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - SDValue CC = Cond.getOperand(2); - - SDValue True = N->getOperand(1); - SDValue False = N->getOperand(2); - - if (VT == MVT::f32) - return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - } - - break; - } + case ISD::SELECT: + return performSelectCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -2699,6 +2924,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 7314cc050ba5..37925416a9c4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -54,6 +54,9 @@ private: SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -67,6 +70,9 @@ private: SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue 
performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS, + DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; protected: static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -109,6 +115,8 @@ protected: SmallVectorImpl<ISD::InputArg> &OrigIns) const; void AnalyzeFormalArguments(CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const; + void AnalyzeReturn(CCState &State, + const SmallVectorImpl<ISD::OutputArg> &Outs) const; public: AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); @@ -263,6 +271,7 @@ enum NodeType : unsigned { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + FFBH_U32, // ctlz with -1 if input is zero. MUL_U24, MUL_I24, MAD_U24, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 70e589c28429..575dfe413658 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -191,6 +191,8 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; + // Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when // performing the mulitply. The result is a 32-bit value. def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, @@ -240,4 +242,4 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai // Call/Return DAG Nodes //===----------------------------------------------------------------------===// def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 22f85b3e663c..b1be6197a6c6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -66,8 +66,12 @@ static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { } static MachineSchedRegistry -SchedCustomRegistry("r600", "Run R600's custom scheduler", - createR600MachineScheduler); +R600SchedRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +static MachineSchedRegistry +SISchedRegistry("si", "Run SI's custom scheduler", + createSIMachineScheduler); static std::string computeDataLayout(const Triple &TT) { std::string Ret = "e-p:32:32"; diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 779a14e95d22..2245f1417e53 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -349,7 +349,7 @@ def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; -def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; def FFBL_INT 
: R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; let hasSideEffects = 1 in { diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 68b1d1ae83cc..4bc80a028936 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -28,7 +28,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { //===--- Global Variable Emission Directives --------------------------===// HasAggressiveSymbolFolding = true; COMMDirectiveAlignmentIsInBytes = false; - HasDotTypeDotSizeDirective = false; HasNoDeadStrip = true; WeakRefDirective = ".weakref\t"; //===--- Dwarf Emission Directives -----------------------------------===// diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index 7f79dd34f3ba..aa1e352ed748 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -137,7 +137,7 @@ namespace SIOutMods { #define C_00B84C_EXCP_EN #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC - +#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 96e37c566240..f59d9948f98e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -215,7 +215,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { unsigned SrcReg = MI.getOperand(I).getReg(); - unsigned SrcSubReg = MI.getOperand(I).getReg(); + unsigned SrcSubReg = MI.getOperand(I).getSubReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); assert(TRI->isSGPRClass(SrcRC) && diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 02a39307e74e..6230d1e28b74 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -334,12 +334,20 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { !MRI.hasOneUse(MI.getOperand(0).getReg())) continue; - // FIXME: Fold operands with subregs. if (OpToFold.isReg() && - (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || - OpToFold.getSubReg())) + !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) continue; + // Prevent folding operands backwards in the function. For example, + // the COPY opcode must not be replaced by 1 in this example: + // + // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3 + // ... 
+ // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use> + MachineOperand &Dst = MI.getOperand(0); + if (Dst.isReg() && + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; // We need mutate the operands of new mov instructions to add implicit // uses of EXEC, but adding them invalidates the use_iterator, so defer diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0e043cb47da7..544867513d9c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -259,7 +259,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); @@ -598,18 +597,20 @@ SDValue SITargetLowering::LowerFormalArguments( // First check if it's a PS input addr if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal()) { + !Arg.Flags.isByVal() && PSInputNum <= 15) { - assert((PSInputNum <= 15) && "Too many PS inputs!"); - - if (!Arg.Used) { + if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { // We can safely skip PS inputs Skipped.set(i); ++PSInputNum; continue; } - Info->PSInputAddr |= 1 << PSInputNum++; + Info->markPSInputAllocated(PSInputNum); + if (Arg.Used) + Info->PSInputEna |= 1 << PSInputNum; + + ++PSInputNum; } // Second split vertices into their elements @@ -639,11 +640,25 @@ SDValue SITargetLowering::LowerFormalArguments( *DAG.getContext()); // At least one interpolation mode must be enabled or else the GPU will hang. + // + // Check PSInputAddr instead of PSInputEna. The idea is that if the user set + // PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. if (Info->getShaderType() == ShaderType::PIXEL && - (Info->PSInputAddr & 0x7F) == 0) { - Info->PSInputAddr |= 1; + ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11)))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->PSInputEna |= 1; } if (Info->getShaderType() == ShaderType::COMPUTE) { @@ -872,6 +887,97 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } +SDValue SITargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + if (Info->getShaderType() == ShaderType::COMPUTE) + return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, + OutVals, DL, DAG); + + Info->setIfReturnsVoid(Outs.size() == 0); + + SmallVector<ISD::OutputArg, 48> Splits; + SmallVector<SDValue, 48> SplitVals; + + // Split vectors into their elements. 
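// Illustrative sketch (hypothetical helper, not from this patch): the
// SPI_PS_INPUT_ADDR restriction described in LowerFormalArguments above,
// restated as a standalone predicate. Masks are taken from the hunk:
// 0x7F = PERSP_* | LINEAR_*, 0xF = PERSP_*, bit 11 = POS_W_FLOAT.
static bool needsForcedInterpolant(unsigned PSInputAddr) {
  bool AnyInterp = (PSInputAddr & 0x7F) != 0;  // some PERSP_* or LINEAR_* bit
  bool AnyPersp  = (PSInputAddr & 0xF) != 0;   // some PERSP_* bit
  bool PosWFloat = (PSInputAddr & (1u << 11)) != 0;
  // When this holds, the code above enables input 0 and reserves VGPR0/VGPR1.
  return !AnyInterp || (PosWFloat && !AnyPersp);
}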
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + const ISD::OutputArg &Out = Outs[i]; + + if (Out.VT.isVector()) { + MVT VT = Out.VT.getVectorElementType(); + ISD::OutputArg NewOut = Out; + NewOut.Flags.setSplit(); + NewOut.VT = VT; + + // We want the original number of vector elements here, e.g. + // three or five, not four or eight. + unsigned NumElements = Out.ArgVT.getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i], + DAG.getConstant(j, DL, MVT::i32)); + SplitVals.push_back(Elem); + Splits.push_back(NewOut); + NewOut.PartOffset += NewOut.VT.getStoreSize(); + } + } else { + SplitVals.push_back(OutVals[i]); + Splits.push_back(Out); + } + } + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector<CCValAssign, 48> RVLocs; + + // CCState - Info about the registers and stack slots. + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + + // Analyze outgoing return values. + AnalyzeReturn(CCInfo, Splits); + + SDValue Flag; + SmallVector<SDValue, 48> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + + // Copy the result values into the output registers. + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = SplitVals[realRVLocIdx]; + + // Copied from other backends. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // Update chain and glue. 
+ RetOps[0] = Chain; + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); +} + MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { @@ -1158,6 +1264,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_dispatch_ptr: + if (!Subtarget->isAmdHsaOS()) { + DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), + "hsa intrinsic without hsa target"); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); @@ -2027,7 +2140,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: { return performUCharToFloatCombine(N, DCI); - + } case ISD::FADD: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) break; @@ -2109,7 +2222,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } - } case ISD::LOAD: case ISD::STORE: case ISD::ATOMIC_LOAD: diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index e2f8cb19d6be..f01b2c0d09f3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -95,6 +95,13 @@ public: SDLoc DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, MachineBasicBlock * BB) const override; bool enableAggressiveFMAFusion(EVT VT) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index 821aada526c7..94e614750d2f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -84,6 +84,9 @@ private: bool LastInstWritesM0; + /// \brief Whether the machine function returns void + bool ReturnsVoid; + /// \brief Get increment/decrement amount for this instruction. Counters getHwCounts(MachineInstr &MI); @@ -322,7 +325,9 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, const Counters &Required) { // End of program? No need to wait on anything - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + // A function not returning void needs to wait, because other bytecode will + // be appended after it and we don't know what it will be. + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid) return false; // Figure out if the async instructions execute in order @@ -465,6 +470,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; + ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid(); memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); @@ -488,6 +494,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + + // Functions returning something shouldn't contain S_ENDPGM, because other + // bytecode will be appended after it. 
+ if (!ReturnsVoid) { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + I->eraseFromParent(); + } } return Changes; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a08a5a8fed36..1e10d25e8fb7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1777,6 +1777,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, MRI.getRegClass(Reg) : RI.getPhysRegClass(Reg); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); + RC = TRI->getSubRegClass(RC, MO.getSubReg()); + // In order to be legal, the common sub-class must be equal to the // class of the current operand. For example: // @@ -3075,3 +3079,15 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { return Rsrc23; } + +bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isSMRD(Opc); +} + +bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 307ef67ed263..cce1ae725611 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -462,6 +462,9 @@ public: uint64_t getDefaultRsrcDataFormat() const; uint64_t getScratchRsrcWords23() const; + + bool isLowLatencyInstruction(const MachineInstr *MI) const; + bool isHighLatencyInstruction(const MachineInstr *MI) const; }; namespace AMDGPU { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index b7df058b7c0c..89692ab71f4d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -144,7 +144,7 @@ defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", - [(set i32:$dst, (ctlz_zero_undef i32:$src0))] + [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))] >; defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index bf15516bea7b..49677fc2b0a3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -46,8 +46,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), - LDSWaveSpillSize(0), PSInputAddr(0), + ReturnsVoid(true), + LDSWaveSpillSize(0), + PSInputEna(0), NumUserSGPRs(0), NumSystemSGPRs(0), HasSpilledSGPRs(false), @@ -72,6 +74,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); const Function *F = MF.getFunction(); + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); if (getShaderType() == ShaderType::COMPUTE) diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h 
b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9c528d63bd0e..846ee5de057d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -57,10 +57,14 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { unsigned WorkGroupInfoSystemSGPR; unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + // Graphics info. + unsigned PSInputAddr; + bool ReturnsVoid; + public: // FIXME: Make private unsigned LDSWaveSpillSize; - unsigned PSInputAddr; + unsigned PSInputEna; std::map<unsigned, unsigned> LaneVGPRs; unsigned ScratchOffsetReg; unsigned NumUserSGPRs; @@ -273,6 +277,26 @@ public: HasSpilledVGPRs = Spill; } + unsigned getPSInputAddr() const { + return PSInputAddr; + } + + bool isPSInputAllocated(unsigned Index) const { + return PSInputAddr & (1 << Index); + } + + void markPSInputAllocated(unsigned Index) { + PSInputAddr |= 1 << Index; + } + + bool returnsVoid() const { + return ReturnsVoid; + } + + void setIfReturnsVoid(bool Value) { + ReturnsVoid = Value; + } + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp new file mode 100644 index 000000000000..1cfa98430020 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -0,0 +1,1968 @@ +//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#include "SIMachineScheduler.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +// This scheduler implements a different scheduling algorithm than +// GenericScheduler. +// +// There are several specific architecture behaviours that can't be modelled +// for GenericScheduler: +// . When accessing the result of an SGPR load instruction, you have to wait +// for all the SGPR load instructions before your current instruction to +// have finished. +// . When accessing the result of an VGPR load instruction, you have to wait +// for all the VGPR load instructions previous to the VGPR load instruction +// you are interested in to finish. +// . The less the register pressure, the best load latencies are hidden +// +// Moreover some specifities (like the fact a lot of instructions in the shader +// have few dependencies) makes the generic scheduler have some unpredictable +// behaviours. For example when register pressure becomes high, it can either +// manage to prevent register pressure from going too high, or it can +// increase register pressure even more than if it hadn't taken register +// pressure into account. +// +// Also some other bad behaviours are generated, like loading at the beginning +// of the shader a constant in VGPR you won't need until the end of the shader. +// +// The scheduling problem for SI can distinguish three main parts: +// . 
Hiding high latencies (texture sampling, etc) +// . Hiding low latencies (SGPR constant loading, etc) +// . Keeping register usage low for better latency hiding and general +// performance +// +// Some other things can also affect performance, but are hard to predict +// (cache usage, the fact the HW can issue several instructions from different +// wavefronts if different types, etc) +// +// This scheduler tries to solve the scheduling problem by dividing it into +// simpler sub-problems. It divides the instructions into blocks, schedules +// locally inside the blocks where it takes care of low latencies, and then +// chooses the order of the blocks by taking care of high latencies. +// Dividing the instructions into blocks helps control keeping register +// usage low. +// +// First the instructions are put into blocks. +// We want the blocks help control register usage and hide high latencies +// later. To help control register usage, we typically want all local +// computations, when for example you create a result that can be comsummed +// right away, to be contained in a block. Block inputs and outputs would +// typically be important results that are needed in several locations of +// the shader. Since we do want blocks to help hide high latencies, we want +// the instructions inside the block to have a minimal set of dependencies +// on high latencies. It will make it easy to pick blocks to hide specific +// high latencies. +// The block creation algorithm is divided into several steps, and several +// variants can be tried during the scheduling process. +// +// Second the order of the instructions inside the blocks is choosen. +// At that step we do take into account only register usage and hiding +// low latency instructions +// +// Third the block order is choosen, there we try to hide high latencies +// and keep register usage low. +// +// After the third step, a pass is done to improve the hiding of low +// latencies. +// +// Actually when talking about 'low latency' or 'high latency' it includes +// both the latency to get the cache (or global mem) data go to the register, +// and the bandwith limitations. +// Increasing the number of active wavefronts helps hide the former, but it +// doesn't solve the latter, thus why even if wavefront count is high, we have +// to try have as many instructions hiding high latencies as possible. +// The OpenCL doc says for example latency of 400 cycles for a global mem access, +// which is hidden by 10 instructions if the wavefront count is 10. + +// Some figures taken from AMD docs: +// Both texture and constant L1 caches are 4-way associative with 64 bytes +// lines. +// Constant cache is shared with 4 CUs. +// For texture sampling, the address generation unit receives 4 texture +// addresses per cycle, thus we could expect texture sampling latency to be +// equivalent to 4 instructions in the very best case (a VGPR is 64 work items, +// instructions in a wavefront group are executed every 4 cycles), +// or 16 instructions if the other wavefronts associated to the 3 other VALUs +// of the CU do texture sampling too. (Don't take these figures too seriously, +// as I'm not 100% sure of the computation) +// Data exports should get similar latency. +// For constant loading, the cache is shader with 4 CUs. +// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit" +// I guess if the other CU don't read the cache, it can go up to 64B/cycle. 
+// It means a simple s_buffer_load should take one instruction to hide, as +// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same +// cache line. +// +// As of today the driver doesn't preload the constants in cache, thus the +// first loads get extra latency. The doc says global memory access can be +// 300-600 cycles. We do not specially take that into account when scheduling +// As we expect the driver to be able to preload the constants soon. + + +// common code // + +#ifndef NDEBUG + +static const char *getReasonStr(SIScheduleCandReason Reason) { + switch (Reason) { + case NoCand: return "NOCAND"; + case RegUsage: return "REGUSAGE"; + case Latency: return "LATENCY"; + case Successor: return "SUCCESSOR"; + case Depth: return "DEPTH"; + case NodeOrder: return "ORDER"; + } + llvm_unreachable("Unknown reason!"); +} + +#endif + +static bool tryLess(int TryVal, int CandVal, + SISchedulerCandidate &TryCand, + SISchedulerCandidate &Cand, + SIScheduleCandReason Reason) { + if (TryVal < CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal > CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + Cand.setRepeat(Reason); + return false; +} + +static bool tryGreater(int TryVal, int CandVal, + SISchedulerCandidate &TryCand, + SISchedulerCandidate &Cand, + SIScheduleCandReason Reason) { + if (TryVal > CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal < CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + Cand.setRepeat(Reason); + return false; +} + +// SIScheduleBlock // + +void SIScheduleBlock::addUnit(SUnit *SU) { + NodeNum2Index[SU->NodeNum] = SUnits.size(); + SUnits.push_back(SU); +} + +#ifndef NDEBUG + +void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { + + dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); + dbgs() << '\n'; +} +#endif + +void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, + SISchedCandidate &TryCand) { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return; + } + + if (Cand.SGPRUsage > 60 && + tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage)) + return; + + // Schedule low latency instructions as top as possible. + // Order of priority is: + // . Low latency instructions which do not depend on other low latency + // instructions we haven't waited for + // . Other instructions which do not depend on low latency instructions + // we haven't waited for + // . Low latencies + // . All other instructions + // Goal is to get: low latency instructions - independant instructions + // - (eventually some more low latency instructions) + // - instructions that depend on the first low latency instructions. + // If in the block there is a lot of constant loads, the SGPR usage + // could go quite high, thus above the arbitrary limit of 60 will encourage + // use the already loaded constants (in order to release some SGPRs) before + // loading more. 
+ if (tryLess(TryCand.HasLowLatencyNonWaitedParent, + Cand.HasLowLatencyNonWaitedParent, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (TryCand.IsLowLatency && + tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage)) + return; + + // Fall through to original instruction order. + if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { + TryCand.Reason = NodeOrder; + } +} + +SUnit* SIScheduleBlock::pickNode() { + SISchedCandidate TopCand; + + for (SUnit* SU : TopReadySUs) { + SISchedCandidate TryCand; + std::vector<unsigned> pressure; + std::vector<unsigned> MaxPressure; + // Predict register usage after this instruction. + TryCand.SU = SU; + TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); + TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()]; + TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()]; + TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; + TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; + TryCand.HasLowLatencyNonWaitedParent = + HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]; + tryCandidateTopDown(TopCand, TryCand); + if (TryCand.Reason != NoCand) + TopCand.setBest(TryCand); + } + + return TopCand.SU; +} + + +// Schedule something valid. +void SIScheduleBlock::fastSchedule() { + TopReadySUs.clear(); + if (Scheduled) + undoSchedule(); + + for (SUnit* SU : SUnits) { + if (!SU->NumPredsLeft) + TopReadySUs.push_back(SU); + } + + while (!TopReadySUs.empty()) { + SUnit *SU = TopReadySUs[0]; + ScheduledSUnits.push_back(SU); + nodeScheduled(SU); + } + + Scheduled = true; +} + +// Returns if the register was set between first and last. +static bool isDefBetween(unsigned Reg, + SlotIndex First, SlotIndex Last, + const MachineRegisterInfo *MRI, + const LiveIntervals *LIS) { + for (MachineRegisterInfo::def_instr_iterator + UI = MRI->def_instr_begin(Reg), + UE = MRI->def_instr_end(); UI != UE; ++UI) { + const MachineInstr* MI = &*UI; + if (MI->isDebugValue()) + continue; + SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot(); + if (InstSlot >= First && InstSlot <= Last) + return true; + } + return false; +} + +void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock) { + IntervalPressure Pressure, BotPressure; + RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure); + LiveIntervals *LIS = DAG->getLIS(); + MachineRegisterInfo *MRI = DAG->getMRI(); + DAG->initRPTracker(TopRPTracker); + DAG->initRPTracker(BotRPTracker); + DAG->initRPTracker(RPTracker); + + // Goes though all SU. RPTracker captures what had to be alive for the SUs + // to execute, and what is still alive at the end. + for (SUnit* SU : ScheduledSUnits) { + RPTracker.setPos(SU->getInstr()); + RPTracker.advance(); + } + + // Close the RPTracker to finalize live ins/outs. + RPTracker.closeRegion(); + + // Initialize the live ins and live outs. + TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); + BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); + + // Do not Track Physical Registers, because it messes up. 
+ for (unsigned Reg : RPTracker.getPressure().LiveInRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ LiveInRegs.insert(Reg);
+ }
+ LiveOutRegs.clear();
+ // There are several cases to distinguish:
+ // 1) Reg is not input to any instruction in the block, but is output of one
+ // 2) 1) + read in the block and not needed after it
+ // 3) 1) + read in the block but needed in another block
+ // 4) Reg is input of an instruction but another block will read it too
+ // 5) Reg is input of an instruction and then rewritten in the block.
+ // result is not read in the block (implies used in another block)
+ // 6) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block and not needed in another block
+ // 7) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block but also needed in another block
+ // LiveInRegs will contain all the regs in situations 4, 5, 6 and 7.
+ // We want LiveOutRegs to contain only regs whose content will be read after
+ // in another block, and whose content was written in the current block,
+ // that is, we want it to get 1, 3, 5 and 7.
+ // Since the MIs of a block are packed all together before scheduling, the
+ // LiveIntervals are correct and the RPTracker is able to correctly handle
+ // 5 vs 6 and 2 vs 3.
+ // (Note: this is not sufficient to prevent the RPTracker from making
+ // mistakes for case 4.)
+ // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5
+ // and 7.
+ // Comparing against LiveInRegs is not sufficient to differentiate 4 vs 5
+ // and 7. The use of isDefBetween removes case 4.
+ for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(),
+ LIS->getInstructionIndex(EndBlock).getRegSlot(),
+ MRI, LIS)) {
+ LiveOutRegs.insert(Reg);
+ }
+ }
+
+ // Pressure = sum over live registers of their register sizes.
+ // Internally LLVM may represent a register as one big 128-bit register, for
+ // example, even though it corresponds to 4 actual 32-bit registers.
+ // Thus Pressure is not equal to num_live_registers * constant.
+ LiveInPressure = TopPressure.MaxSetPressure;
+ LiveOutPressure = BotPressure.MaxSetPressure;
+
+ // Prepare TopRPTracker for top-down scheduling.
+ TopRPTracker.closeTop();
+}
+
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock) {
+ if (!Scheduled)
+ fastSchedule();
+
+ // Pre-scheduling phase to set LiveIn and LiveOut.
+ initRegPressure(BeginBlock, EndBlock);
+ undoSchedule();
+
+ // Schedule for real now.
+
+ TopReadySUs.clear();
+
+ for (SUnit* SU : SUnits) {
+ if (!SU->NumPredsLeft)
+ TopReadySUs.push_back(SU);
+ }
+
+ while (!TopReadySUs.empty()) {
+ SUnit *SU = pickNode();
+ ScheduledSUnits.push_back(SU);
+ TopRPTracker.setPos(SU->getInstr());
+ TopRPTracker.advance();
+ nodeScheduled(SU);
+ }
+
+ // TODO: compute InternalAdditionnalPressure.
+ InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+ // Check everything is right. 
+#ifndef NDEBUG + assert(SUnits.size() == ScheduledSUnits.size() && + TopReadySUs.empty()); + for (SUnit* SU : SUnits) { + assert(SU->isScheduled && + SU->NumPredsLeft == 0); + } +#endif + + Scheduled = true; +} + +void SIScheduleBlock::undoSchedule() { + for (SUnit* SU : SUnits) { + SU->isScheduled = false; + for (SDep& Succ : SU->Succs) { + if (BC->isSUInBlock(Succ.getSUnit(), ID)) + undoReleaseSucc(SU, &Succ); + } + } + HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); + ScheduledSUnits.clear(); + Scheduled = false; +} + +void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { + SUnit *SuccSU = SuccEdge->getSUnit(); + + if (SuccEdge->isWeak()) { + ++SuccSU->WeakPredsLeft; + return; + } + ++SuccSU->NumPredsLeft; +} + +void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { + SUnit *SuccSU = SuccEdge->getSUnit(); + + if (SuccEdge->isWeak()) { + --SuccSU->WeakPredsLeft; + return; + } +#ifndef NDEBUG + if (SuccSU->NumPredsLeft == 0) { + dbgs() << "*** Scheduling failed! ***\n"; + SuccSU->dump(DAG); + dbgs() << " has been released too many times!\n"; + llvm_unreachable(nullptr); + } +#endif + + --SuccSU->NumPredsLeft; +} + +/// Release Successors of the SU that are in the block or not. +void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { + for (SDep& Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + + if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) + continue; + + releaseSucc(SU, &Succ); + if (SuccSU->NumPredsLeft == 0 && InOrOutBlock) + TopReadySUs.push_back(SuccSU); + } +} + +void SIScheduleBlock::nodeScheduled(SUnit *SU) { + // Is in TopReadySUs + assert (!SU->NumPredsLeft); + std::vector<SUnit*>::iterator I = + std::find(TopReadySUs.begin(), TopReadySUs.end(), SU); + if (I == TopReadySUs.end()) { + dbgs() << "Data Structure Bug in SI Scheduler\n"; + llvm_unreachable(nullptr); + } + TopReadySUs.erase(I); + + releaseSuccessors(SU, true); + // Scheduling this node will trigger a wait, + // thus propagate to other instructions that they do not need to wait either. + if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) + HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); + + if (DAG->IsLowLatencySU[SU->NodeNum]) { + for (SDep& Succ : SU->Succs) { + std::map<unsigned, unsigned>::iterator I = + NodeNum2Index.find(Succ.getSUnit()->NodeNum); + if (I != NodeNum2Index.end()) + HasLowLatencyNonWaitedParent[I->second] = 1; + } + } + SU->isScheduled = true; +} + +void SIScheduleBlock::finalizeUnits() { + // We remove links from outside blocks to enable scheduling inside the block. + for (SUnit* SU : SUnits) { + releaseSuccessors(SU, false); + if (DAG->IsHighLatencySU[SU->NodeNum]) + HighLatencyBlock = true; + } + HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); +} + +// we maintain ascending order of IDs +void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { + unsigned PredID = Pred->getID(); + + // Check if not already predecessor. + for (SIScheduleBlock* P : Preds) { + if (PredID == P->getID()) + return; + } + Preds.push_back(Pred); + +#ifndef NDEBUG + for (SIScheduleBlock* S : Succs) { + if (PredID == S->getID()) + assert(!"Loop in the Block Graph!\n"); + } +#endif +} + +void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { + unsigned SuccID = Succ->getID(); + + // Check if not already predecessor. 
+ for (SIScheduleBlock* S : Succs) { + if (SuccID == S->getID()) + return; + } + if (Succ->isHighLatencyBlock()) + ++NumHighLatencySuccessors; + Succs.push_back(Succ); +#ifndef NDEBUG + for (SIScheduleBlock* P : Preds) { + if (SuccID == P->getID()) + assert("Loop in the Block Graph!\n"); + } +#endif +} + +#ifndef NDEBUG +void SIScheduleBlock::printDebug(bool full) { + dbgs() << "Block (" << ID << ")\n"; + if (!full) + return; + + dbgs() << "\nContains High Latency Instruction: " + << HighLatencyBlock << '\n'; + dbgs() << "\nDepends On:\n"; + for (SIScheduleBlock* P : Preds) { + P->printDebug(false); + } + + dbgs() << "\nSuccessors:\n"; + for (SIScheduleBlock* S : Succs) { + S->printDebug(false); + } + + if (Scheduled) { + dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' ' + << LiveInPressure[DAG->getVGPRSetID()] << '\n'; + dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' ' + << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; + dbgs() << "LiveIns:\n"; + for (unsigned Reg : LiveInRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + + dbgs() << "\nLiveOuts:\n"; + for (unsigned Reg : LiveOutRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + } + + dbgs() << "\nInstructions:\n"; + if (!Scheduled) { + for (SUnit* SU : SUnits) { + SU->dump(DAG); + } + } else { + for (SUnit* SU : SUnits) { + SU->dump(DAG); + } + } + + dbgs() << "///////////////////////\n"; +} + +#endif + +// SIScheduleBlockCreator // + +SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) : +DAG(DAG) { +} + +SIScheduleBlockCreator::~SIScheduleBlockCreator() { +} + +SIScheduleBlocks +SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { + std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B = + Blocks.find(BlockVariant); + if (B == Blocks.end()) { + SIScheduleBlocks Res; + createBlocksForVariant(BlockVariant); + topologicalSort(); + scheduleInsideBlocks(); + fillStats(); + Res.Blocks = CurrentBlocks; + Res.TopDownIndex2Block = TopDownIndex2Block; + Res.TopDownBlock2Index = TopDownBlock2Index; + Blocks[BlockVariant] = Res; + return Res; + } else { + return B->second; + } +} + +bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { + if (SU->NodeNum >= DAG->SUnits.size()) + return false; + return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID; +} + +void SIScheduleBlockCreator::colorHighLatenciesAlone() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) { + CurrentColoring[SU->NodeNum] = NextReservedID++; + } + } +} + +void SIScheduleBlockCreator::colorHighLatenciesGroups() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned NumHighLatencies = 0; + unsigned GroupSize; + unsigned Color = NextReservedID; + unsigned Count = 0; + std::set<unsigned> FormingGroup; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) + ++NumHighLatencies; + } + + if (NumHighLatencies == 0) + return; + + if (NumHighLatencies <= 6) + GroupSize = 2; + else if (NumHighLatencies <= 12) + GroupSize = 3; + else + GroupSize = 4; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) { + unsigned CompatibleGroup = true; + unsigned ProposedColor = Color; + for (unsigned j : FormingGroup) { + // TODO: Currently CompatibleGroup will always be false, + // 
because the graph enforces the load order. This + // can be fixed, but as keeping the load order is often + // good for performance that causes a performance hit (both + // the default scheduler and this scheduler). + // When this scheduler determines a good load order, + // this can be fixed. + if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) || + !DAG->canAddEdge(&DAG->SUnits[j], SU)) + CompatibleGroup = false; + } + if (!CompatibleGroup || ++Count == GroupSize) { + FormingGroup.clear(); + Color = ++NextReservedID; + if (!CompatibleGroup) { + ProposedColor = Color; + FormingGroup.insert(SU->NodeNum); + } + Count = 0; + } else { + FormingGroup.insert(SU->NodeNum); + } + CurrentColoring[SU->NodeNum] = ProposedColor; + } + } +} + +void SIScheduleBlockCreator::colorComputeReservedDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<std::set<unsigned>, unsigned> ColorCombinations; + + CurrentTopDownReservedDependencyColoring.clear(); + CurrentBottomUpReservedDependencyColoring.clear(); + + CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); + CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); + + // Traverse TopDown, and give different colors to SUs depending + // on which combination of High Latencies they depend on. + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]]; + std::set<unsigned> SUColors; + + // Already given. + if (CurrentColoring[SU->NodeNum]) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + CurrentColoring[SU->NodeNum]; + continue; + } + + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) + continue; + if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0) + SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]); + } + // Color 0 by default. + if (SUColors.empty()) + continue; + // Same color than parents. + if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + *SUColors.begin(); + else { + std::map<std::set<unsigned>, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second; + } else { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } + } + + ColorCombinations.clear(); + + // Same as before, but BottomUp. + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + // Already given. + if (CurrentColoring[SU->NodeNum]) { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + CurrentColoring[SU->NodeNum]; + continue; + } + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0) + SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]); + } + // Keep color 0. + if (SUColors.empty()) + continue; + // Same color than parents. 
+ if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + *SUColors.begin(); + else { + std::map<std::set<unsigned>, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second; + } else { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } + } +} + +void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; + + // Every combination of colors given by the top down + // and bottom up Reserved node dependency + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + std::pair<unsigned, unsigned> SUColors; + + // High latency instructions: already given. + if (CurrentColoring[SU->NodeNum]) + continue; + + SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; + SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; + + std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentColoring[SU->NodeNum] = Pos->second; + } else { + CurrentColoring[SU->NodeNum] = NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } +} + +void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::vector<int> PendingColoring = CurrentColoring; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + std::set<unsigned> SUColorsPending; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 || + CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 || + CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0) + SUColors.insert(CurrentColoring[Succ->NodeNum]); + SUColorsPending.insert(PendingColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && SUColorsPending.size() == 1) + PendingColoring[SU->NodeNum] = *SUColors.begin(); + else // TODO: Attribute new colors depending on color + // combination of children. 
+ PendingColoring[SU->NodeNum] = NextNonReservedID++; + } + CurrentColoring = PendingColoring; +} + + +void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned PreviousColor; + std::set<unsigned> SeenColors; + + if (DAGSize <= 1) + return; + + PreviousColor = CurrentColoring[0]; + + for (unsigned i = 1, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned CurrentColor = CurrentColoring[i]; + unsigned PreviousColorSave = PreviousColor; + assert(i == SU->NodeNum); + + if (CurrentColor != PreviousColor) + SeenColors.insert(PreviousColor); + PreviousColor = CurrentColor; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (SeenColors.find(CurrentColor) == SeenColors.end()) + continue; + + if (PreviousColorSave != CurrentColor) + CurrentColoring[i] = NextNonReservedID++; + else + CurrentColoring[i] = CurrentColoring[i-1]; + } +} + +void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + // No predecessor: Vgpr constant loading. + // Low latency instructions usually have a predecessor (the address) + if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum]) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<unsigned, unsigned> ColorCount; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + unsigned color = CurrentColoring[SU->NodeNum]; + std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color); + if (Pos != ColorCount.end()) { + ++ColorCount[color]; + } else { + ColorCount[color] = 1; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + 
unsigned color = CurrentColoring[SU->NodeNum]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (ColorCount[color] > 1) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() != color) { + --ColorCount[color]; + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + ++ColorCount[*SUColors.begin()]; + } + } +} + +void SIScheduleBlockCreator::cutHugeBlocks() { + // TODO +} + +void SIScheduleBlockCreator::regroupNoUserInstructions() { + unsigned DAGSize = DAG->SUnits.size(); + int GroupID = NextNonReservedID++; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + bool hasSuccessor = false; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + hasSuccessor = true; + } + if (!hasSuccessor) + CurrentColoring[SU->NodeNum] = GroupID; + } +} + +void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) { + unsigned DAGSize = DAG->SUnits.size(); + std::map<unsigned,unsigned> RealID; + + CurrentBlocks.clear(); + CurrentColoring.clear(); + CurrentColoring.resize(DAGSize, 0); + Node2CurrentBlock.clear(); + + // Restore links previous scheduling variant has overridden. + DAG->restoreSULinksLeft(); + + NextReservedID = 1; + NextNonReservedID = DAGSize + 1; + + DEBUG(dbgs() << "Coloring the graph\n"); + + if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) + colorHighLatenciesGroups(); + else + colorHighLatenciesAlone(); + colorComputeReservedDependencies(); + colorAccordingToReservedDependencies(); + colorEndsAccordingToDependencies(); + if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive) + colorForceConsecutiveOrderInGroup(); + regroupNoUserInstructions(); + colorMergeConstantLoadsNextGroup(); + colorMergeIfPossibleNextGroupOnlyForReserved(); + + // Put SUs of same color into same block + Node2CurrentBlock.resize(DAGSize, -1); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned Color = CurrentColoring[SU->NodeNum]; + if (RealID.find(Color) == RealID.end()) { + int ID = CurrentBlocks.size(); + BlockPtrs.push_back( + make_unique<SIScheduleBlock>(DAG, this, ID)); + CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); + RealID[Color] = ID; + } + CurrentBlocks[RealID[Color]]->addUnit(SU); + Node2CurrentBlock[SU->NodeNum] = RealID[Color]; + } + + // Build dependencies between blocks. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + int SUID = Node2CurrentBlock[i]; + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (Node2CurrentBlock[Succ->NodeNum] != SUID) + CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]); + } + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) + continue; + if (Node2CurrentBlock[Pred->NodeNum] != SUID) + CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]); + } + } + + // Free root and leafs of all blocks to enable scheduling inside them. 
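+ // (finalizeUnits drops, for each SU, the successor links that point outside
+ // its block, so each block can then be scheduled as an independent sub-DAG;
+ // it also flags blocks that contain a high latency instruction.)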
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->finalizeUnits(); + } + DEBUG( + dbgs() << "Blocks created:\n\n"; + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + } + ); +} + +// Two functions taken from Codegen/MachineScheduler.cpp + +/// If this iterator is a debug value, increment until reaching the End or a +/// non-debug instruction. +static MachineBasicBlock::const_iterator +nextIfDebug(MachineBasicBlock::const_iterator I, + MachineBasicBlock::const_iterator End) { + for(; I != End; ++I) { + if (!I->isDebugValue()) + break; + } + return I; +} + +/// Non-const version. +static MachineBasicBlock::iterator +nextIfDebug(MachineBasicBlock::iterator I, + MachineBasicBlock::const_iterator End) { + // Cast the return value to nonconst MachineInstr, then cast to an + // instr_iterator, which does not check for null, finally return a + // bundle_iterator. + return MachineBasicBlock::instr_iterator( + const_cast<MachineInstr*>( + &*nextIfDebug(MachineBasicBlock::const_iterator(I), End))); +} + +void SIScheduleBlockCreator::topologicalSort() { + unsigned DAGSize = CurrentBlocks.size(); + std::vector<int> WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + + WorkList.reserve(DAGSize); + TopDownIndex2Block.resize(DAGSize); + TopDownBlock2Index.resize(DAGSize); + BottomUpIndex2Block.resize(DAGSize); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + unsigned Degree = Block->getSuccs().size(); + TopDownBlock2Index[i] = Degree; + if (Degree == 0) { + WorkList.push_back(i); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + int i = WorkList.back(); + SIScheduleBlock *Block = CurrentBlocks[i]; + WorkList.pop_back(); + TopDownBlock2Index[i] = --Id; + TopDownIndex2Block[Id] = i; + for (SIScheduleBlock* Pred : Block->getPreds()) { + if (!--TopDownBlock2Index[Pred->getID()]) + WorkList.push_back(Pred->getID()); + } + } + +#ifndef NDEBUG + // Check correctness of the ordering. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + for (SIScheduleBlock* Pred : Block->getPreds()) { + assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] && + "Wrong Top Down topological sorting"); + } + } +#endif + + BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(), + TopDownIndex2Block.rend()); +} + +void SIScheduleBlockCreator::scheduleInsideBlocks() { + unsigned DAGSize = CurrentBlocks.size(); + + DEBUG(dbgs() << "\nScheduling Blocks\n\n"); + + // We do schedule a valid scheduling such that a Block corresponds + // to a range of instructions. + DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n"); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->fastSchedule(); + } + + // Note: the following code, and the part restoring previous position + // is by far the most expensive operation of the Scheduler. + + // Do not update CurrentTop. 
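+ // (A separate iterator, CurrentTopFastSched, walks the fast schedule below
+ // so that DAG->getCurrentTop() still points at the start of the region for
+ // the final scheduling pass done later in SIScheduleDAGMI::schedule().)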
+ MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop(); + std::vector<MachineBasicBlock::iterator> PosOld; + std::vector<MachineBasicBlock::iterator> PosNew; + PosOld.reserve(DAG->SUnits.size()); + PosNew.reserve(DAG->SUnits.size()); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = TopDownIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + std::vector<SUnit*> SUs = Block->getScheduledUnits(); + + for (SUnit* SU : SUs) { + MachineInstr *MI = SU->getInstr(); + MachineBasicBlock::iterator Pos = MI; + PosOld.push_back(Pos); + if (&*CurrentTopFastSched == MI) { + PosNew.push_back(Pos); + CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched, + DAG->getCurrentBottom()); + } else { + // Update the instruction stream. + DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI); + + // Update LiveIntervals. + // Note: Moving all instructions and calling handleMove everytime + // is the most cpu intensive operation of the scheduler. + // It would gain a lot if there was a way to recompute the + // LiveIntervals for the entire scheduling region. + DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true); + PosNew.push_back(CurrentTopFastSched); + } + } + } + + // Now we have Block of SUs == Block of MI. + // We do the final schedule for the instructions inside the block. + // The property that all the SUs of the Block are grouped together as MI + // is used for correct reg usage tracking. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + std::vector<SUnit*> SUs = Block->getScheduledUnits(); + Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr()); + } + + DEBUG(dbgs() << "Restoring MI Pos\n"); + // Restore old ordering (which prevents a LIS->handleMove bug). + for (unsigned i = PosOld.size(), e = 0; i != e; --i) { + MachineBasicBlock::iterator POld = PosOld[i-1]; + MachineBasicBlock::iterator PNew = PosNew[i-1]; + if (PNew != POld) { + // Update the instruction stream. + DAG->getBB()->splice(POld, DAG->getBB(), PNew); + + // Update LiveIntervals. 
+ DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true); + } + } + + DEBUG( + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + } + ); +} + +void SIScheduleBlockCreator::fillStats() { + unsigned DAGSize = CurrentBlocks.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = TopDownIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + if (Block->getPreds().size() == 0) + Block->Depth = 0; + else { + unsigned Depth = 0; + for (SIScheduleBlock *Pred : Block->getPreds()) { + if (Depth < Pred->Depth + 1) + Depth = Pred->Depth + 1; + } + Block->Depth = Depth; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = BottomUpIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + if (Block->getSuccs().size() == 0) + Block->Height = 0; + else { + unsigned Height = 0; + for (SIScheduleBlock *Succ : Block->getSuccs()) { + if (Height < Succ->Height + 1) + Height = Succ->Height + 1; + } + Block->Height = Height; + } + } +} + +// SIScheduleBlockScheduler // + +SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, + SISchedulerBlockSchedulerVariant Variant, + SIScheduleBlocks BlocksStruct) : + DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks), + LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0), + SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) { + + // Fill the usage of every output + // Warning: while by construction we always have a link between two blocks + // when one needs a result from the other, the number of users of an output + // is not the sum of child blocks having as input the same virtual register. + // Here is an example. A produces x and y. B eats x and produces x'. + // C eats x' and y. The register coalescer may have attributed the same + // virtual register to x and x'. + // To count accurately, we do a topological sort. In case the register is + // found for several parents, we increment the usage of the one with the + // highest topological index. 
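+ // Illustrative (hypothetical) example: if %x is output by two predecessors
+ // of a consumer block, B1 (topological index 2) and B2 (topological
+ // index 5), only B2, the latest producer in topological order, gets its
+ // usage count for %x incremented.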
+ LiveOutRegsNumUsages.resize(Blocks.size()); + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + int topoInd = -1; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { + topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; + } + } + } + + if (!Found) + continue; + + int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; + std::map<unsigned, unsigned>::iterator RegPos = + LiveOutRegsNumUsages[PredID].find(Reg); + if (RegPos != LiveOutRegsNumUsages[PredID].end()) { + ++LiveOutRegsNumUsages[PredID][Reg]; + } else { + LiveOutRegsNumUsages[PredID][Reg] = 1; + } + } + } + + LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0); + BlockNumPredsLeft.resize(Blocks.size()); + BlockNumSuccsLeft.resize(Blocks.size()); + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + BlockNumPredsLeft[i] = Block->getPreds().size(); + BlockNumSuccsLeft[i] = Block->getSuccs().size(); + } + +#ifndef NDEBUG + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + assert(Block->getID() == i); + } +#endif + + std::set<unsigned> InRegs = DAG->getInRegs(); + addLiveRegs(InRegs); + + // Fill LiveRegsConsumers for regs that were already + // defined before scheduling. + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + break; + } + } + + if (!Found) { + if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end()) + LiveRegsConsumers[Reg] = 1; + else + ++LiveRegsConsumers[Reg]; + } + } + } + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + if (BlockNumPredsLeft[i] == 0) { + ReadyBlocks.push_back(Block); + } + } + + while (SIScheduleBlock *Block = pickBlock()) { + BlocksScheduled.push_back(Block); + blockScheduled(Block); + } + + DEBUG( + dbgs() << "Block Order:"; + for (SIScheduleBlock* Block : BlocksScheduled) { + dbgs() << ' ' << Block->getID(); + } + ); +} + +bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Try to hide high latencies. + if (tryLess(TryCand.LastPosHighLatParentScheduled, + Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) + return true; + // Schedule high latencies early so you can hide them better. 
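+ // (Summary of the tryGreater chain below: a block holding a high latency
+ // instruction is preferred; among high latency blocks the one with the
+ // greater Height wins; then the block with more high latency successors.)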
+ if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, + TryCand, Cand, Latency)) + return true; + if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height, + TryCand, Cand, Depth)) + return true; + if (tryGreater(TryCand.NumHighLatencySuccessors, + Cand.NumHighLatencySuccessors, + TryCand, Cand, Successor)) + return true; + return false; +} + +bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, + TryCand, Cand, RegUsage)) + return true; + if (tryGreater(TryCand.NumSuccessors > 0, + Cand.NumSuccessors > 0, + TryCand, Cand, Successor)) + return true; + if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) + return true; + if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, + TryCand, Cand, RegUsage)) + return true; + return false; +} + +SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { + SIBlockSchedCandidate Cand; + std::vector<SIScheduleBlock*>::iterator Best; + SIScheduleBlock *Block; + if (ReadyBlocks.empty()) + return nullptr; + + DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), + VregCurrentUsage, SregCurrentUsage); + if (VregCurrentUsage > maxVregUsage) + maxVregUsage = VregCurrentUsage; + if (VregCurrentUsage > maxSregUsage) + maxSregUsage = VregCurrentUsage; + DEBUG( + dbgs() << "Picking New Blocks\n"; + dbgs() << "Available: "; + for (SIScheduleBlock* Block : ReadyBlocks) + dbgs() << Block->getID() << ' '; + dbgs() << "\nCurrent Live:\n"; + for (unsigned Reg : LiveRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << '\n'; + dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; + dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n'; + ); + + Cand.Block = nullptr; + for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), + E = ReadyBlocks.end(); I != E; ++I) { + SIBlockSchedCandidate TryCand; + TryCand.Block = *I; + TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); + TryCand.VGPRUsageDiff = + checkRegUsageImpact(TryCand.Block->getInRegs(), + TryCand.Block->getOutRegs())[DAG->getVGPRSetID()]; + TryCand.NumSuccessors = TryCand.Block->getSuccs().size(); + TryCand.NumHighLatencySuccessors = + TryCand.Block->getNumHighLatencySuccessors(); + TryCand.LastPosHighLatParentScheduled = + (unsigned int) std::max<int> (0, + LastPosHighLatencyParentScheduled[TryCand.Block->getID()] - + LastPosWaitedHighLatency); + TryCand.Height = TryCand.Block->Height; + // Try not to increase VGPR usage too much, else we may spill. + if (VregCurrentUsage > 120 || + Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) { + if (!tryCandidateRegUsage(Cand, TryCand) && + Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage) + tryCandidateLatency(Cand, TryCand); + } else { + if (!tryCandidateLatency(Cand, TryCand)) + tryCandidateRegUsage(Cand, TryCand); + } + if (TryCand.Reason != NoCand) { + Cand.setBest(TryCand); + Best = I; + DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' + << getReasonStr(Cand.Reason) << '\n'); + } + } + + DEBUG( + dbgs() << "Picking: " << Cand.Block->getID() << '\n'; + dbgs() << "Is a block with high latency instruction: " + << (Cand.IsHighLatency ? 
"yes\n" : "no\n"); + dbgs() << "Position of last high latency dependency: " + << Cand.LastPosHighLatParentScheduled << '\n'; + dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; + dbgs() << '\n'; + ); + + Block = Cand.Block; + ReadyBlocks.erase(Best); + return Block; +} + +// Tracking of currently alive registers to determine VGPR Usage. + +void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + // If not already in the live set, then add it. + (void) LiveRegs.insert(Reg); + } +} + +void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, + std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + std::set<unsigned>::iterator Pos = LiveRegs.find(Reg); + assert (Pos != LiveRegs.end() && // Reg must be live. + LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && + LiveRegsConsumers[Reg] >= 1); + --LiveRegsConsumers[Reg]; + if (LiveRegsConsumers[Reg] == 0) + LiveRegs.erase(Pos); + } +} + +void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { + for (SIScheduleBlock* Block : Parent->getSuccs()) { + --BlockNumPredsLeft[Block->getID()]; + if (BlockNumPredsLeft[Block->getID()] == 0) { + ReadyBlocks.push_back(Block); + } + // TODO: Improve check. When the dependency between the high latency + // instructions and the instructions of the other blocks are WAR or WAW + // there will be no wait triggered. We would like these cases to not + // update LastPosHighLatencyParentScheduled. + if (Parent->isHighLatencyBlock()) + LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled; + } +} + +void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { + decreaseLiveRegs(Block, Block->getInRegs()); + addLiveRegs(Block->getOutRegs()); + releaseBlockSuccs(Block); + for (std::map<unsigned, unsigned>::iterator RegI = + LiveOutRegsNumUsages[Block->getID()].begin(), + E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { + std::pair<unsigned, unsigned> RegP = *RegI; + if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end()) + LiveRegsConsumers[RegP.first] = RegP.second; + else { + assert(LiveRegsConsumers[RegP.first] == 0); + LiveRegsConsumers[RegP.first] += RegP.second; + } + } + if (LastPosHighLatencyParentScheduled[Block->getID()] > + (unsigned)LastPosWaitedHighLatency) + LastPosWaitedHighLatency = + LastPosHighLatencyParentScheduled[Block->getID()]; + ++NumBlockScheduled; +} + +std::vector<int> +SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs) { + std::vector<int> DiffSetPressure; + DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); + + for (unsigned Reg : InRegs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + if (LiveRegsConsumers[Reg] > 1) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] -= PSetI.getWeight(); + } + } + + for (unsigned Reg : OutRegs) { + // For now only track virtual registers. 
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] += PSetI.getWeight(); + } + } + + return DiffSetPressure; +} + +// SIScheduler // + +struct SIScheduleBlockResult +SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant) { + SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); + SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); + std::vector<SIScheduleBlock*> ScheduledBlocks; + struct SIScheduleBlockResult Res; + + ScheduledBlocks = Scheduler.getBlocks(); + + for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { + SIScheduleBlock *Block = ScheduledBlocks[b]; + std::vector<SUnit*> SUs = Block->getScheduledUnits(); + + for (SUnit* SU : SUs) + Res.SUs.push_back(SU->NodeNum); + } + + Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); + Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); + return Res; +} + +// SIScheduleDAGMI // + +SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : + ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) { + SITII = static_cast<const SIInstrInfo*>(TII); + SITRI = static_cast<const SIRegisterInfo*>(TRI); + + VGPRSetID = SITRI->getVGPR32PressureSet(); + SGPRSetID = SITRI->getSGPR32PressureSet(); +} + +SIScheduleDAGMI::~SIScheduleDAGMI() { +} + +ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) { + return new SIScheduleDAGMI(C); +} + +// Code adapted from scheduleDAG.cpp +// Does a topological sort over the SUs. +// Both TopDown and BottomUp +void SIScheduleDAGMI::topologicalSort() { + std::vector<int> TopDownSU2Index; + unsigned DAGSize = SUnits.size(); + std::vector<SUnit*> WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + WorkList.reserve(DAGSize); + + TopDownIndex2SU.resize(DAGSize); + TopDownSU2Index.resize(DAGSize); + BottomUpIndex2SU.resize(DAGSize); + + WorkList.push_back(&getExitSU()); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + int NodeNum = SU->NodeNum; + unsigned Degree = SU->Succs.size(); + TopDownSU2Index[NodeNum] = Degree; + if (Degree == 0) { + assert(SU->Succs.empty() && "SUnit should have no successors"); + WorkList.push_back(SU); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + if (SU->NodeNum < DAGSize) { + TopDownSU2Index[SU->NodeNum] = --Id; + TopDownIndex2SU[Id] = SU->NodeNum; + } + for (SDep& Pred : SU->Preds) { + SUnit *SU = Pred.getSUnit(); + if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum]) + WorkList.push_back(SU); + } + } + + BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(), + TopDownIndex2SU.rend()); + +#ifndef NDEBUG + // Check correctness of the ordering + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Pred : SU->Preds) { + if (Pred.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] > + TopDownSU2Index[Pred.getSUnit()->NodeNum] && + "Wrong Top Down topological sorting"); + } + } + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Succ : SU->Succs) { + if (Succ.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] < + TopDownSU2Index[Succ.getSUnit()->NodeNum] && + "Wrong Bottom Up topological sorting"); + } + } +#endif +} + +// Move low latencies further from their user without +// increasing SGPR usage (in general) +// 
This is to be replaced by a better pass that would +// take into account SGPR usage (based on VGPR Usage +// and the corresponding wavefront count), that would +// try to merge groups of loads if it make sense, etc +void SIScheduleDAGMI::moveLowLatencies() { + unsigned DAGSize = SUnits.size(); + int LastLowLatencyUser = -1; + int LastLowLatencyPos = -1; + + for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[ScheduledSUnits[i]]; + bool IsLowLatencyUser = false; + unsigned MinPos = 0; + + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (SITII->isLowLatencyInstruction(Pred->getInstr())) { + IsLowLatencyUser = true; + } + if (Pred->NodeNum >= DAGSize) + continue; + unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; + if (PredPos >= MinPos) + MinPos = PredPos + 1; + } + + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + unsigned BestPos = LastLowLatencyUser + 1; + if ((int)BestPos <= LastLowLatencyPos) + BestPos = LastLowLatencyPos + 1; + if (BestPos < MinPos) + BestPos = MinPos; + if (BestPos < i) { + for (unsigned u = i; u > BestPos; --u) { + ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; + ScheduledSUnits[u] = ScheduledSUnits[u-1]; + } + ScheduledSUnits[BestPos] = SU->NodeNum; + ScheduledSUnitsInv[SU->NodeNum] = BestPos; + } + LastLowLatencyPos = BestPos; + if (IsLowLatencyUser) + LastLowLatencyUser = BestPos; + } else if (IsLowLatencyUser) { + LastLowLatencyUser = i; + // Moves COPY instructions on which depends + // the low latency instructions too. + } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { + bool CopyForLowLat = false; + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SITII->isLowLatencyInstruction(Succ->getInstr())) { + CopyForLowLat = true; + } + } + if (!CopyForLowLat) + continue; + if (MinPos < i) { + for (unsigned u = i; u > MinPos; --u) { + ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; + ScheduledSUnits[u] = ScheduledSUnits[u-1]; + } + ScheduledSUnits[MinPos] = SU->NodeNum; + ScheduledSUnitsInv[SU->NodeNum] = MinPos; + } + } + } +} + +void SIScheduleDAGMI::restoreSULinksLeft() { + for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + SUnits[i].isScheduled = false; + SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; + SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; + SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; + SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; + } +} + +// Return the Vgpr and Sgpr usage corresponding to some virtual registers. 
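+// The usage is accumulated using pressure-set weights, so a live 128-bit
+// virtual register typically counts as 4 rather than 1 (see the Pressure
+// comments above).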
+template<typename _Iterator> void +SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, + unsigned &VgprUsage, unsigned &SgprUsage) { + VgprUsage = 0; + SgprUsage = 0; + for (_Iterator RegI = First; RegI != End; ++RegI) { + unsigned Reg = *RegI; + // For now only track virtual registers + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = MRI.getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + if (*PSetI == VGPRSetID) + VgprUsage += PSetI.getWeight(); + else if (*PSetI == SGPRSetID) + SgprUsage += PSetI.getWeight(); + } + } +} + +void SIScheduleDAGMI::schedule() +{ + SmallVector<SUnit*, 8> TopRoots, BotRoots; + SIScheduleBlockResult Best, Temp; + DEBUG(dbgs() << "Preparing Scheduling\n"); + + buildDAGWithRegPressure(); + DEBUG( + for(SUnit& SU : SUnits) + SU.dumpAll(this) + ); + + Topo.InitDAGTopologicalSorting(); + topologicalSort(); + findRootsAndBiasEdges(TopRoots, BotRoots); + // We reuse several ScheduleDAGMI and ScheduleDAGMILive + // functions, but to make them happy we must initialize + // the default Scheduler implementation (even if we do not + // run it) + SchedImpl->initialize(this); + initQueues(TopRoots, BotRoots); + + // Fill some stats to help scheduling. + + SUnitsLinksBackup = SUnits; + IsLowLatencySU.clear(); + LowLatencyOffset.clear(); + IsHighLatencySU.clear(); + + IsLowLatencySU.resize(SUnits.size(), 0); + LowLatencyOffset.resize(SUnits.size(), 0); + IsHighLatencySU.resize(SUnits.size(), 0); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[i]; + unsigned BaseLatReg, OffLatReg; + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + IsLowLatencySU[i] = 1; + if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg, + OffLatReg, TRI)) + LowLatencyOffset[i] = OffLatReg; + } else if (SITII->isHighLatencyInstruction(SU->getInstr())) + IsHighLatencySU[i] = 1; + } + + SIScheduler Scheduler(this); + Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, + SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); +#if 0 // To enable when handleMove fix lands + // if VGPR usage is extremely high, try other good performing variants + // which could lead to lower VGPR usage + if (Best.MaxVGPRUsage > 180) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + { LatenciesAlone, BlockRegUsageLatency }, +// { LatenciesAlone, BlockRegUsage }, + { LatenciesGrouped, BlockLatencyRegUsage }, +// { LatenciesGrouped, BlockRegUsageLatency }, +// { LatenciesGrouped, BlockRegUsage }, + { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, +// { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } + // if VGPR usage is still extremely high, we may spill. Try other variants + // which are less performing, but that could lead to lower VGPR usage. 
+ if (Best.MaxVGPRUsage > 200) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { +// { LatenciesAlone, BlockRegUsageLatency }, + { LatenciesAlone, BlockRegUsage }, +// { LatenciesGrouped, BlockLatencyRegUsage }, + { LatenciesGrouped, BlockRegUsageLatency }, + { LatenciesGrouped, BlockRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, + { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, + { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } +#endif + ScheduledSUnits = Best.SUs; + ScheduledSUnitsInv.resize(SUnits.size()); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + ScheduledSUnitsInv[ScheduledSUnits[i]] = i; + } + + moveLowLatencies(); + + // Tell the outside world about the result of the scheduling. + + assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); + TopRPTracker.setPos(CurrentTop); + + for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), + E = ScheduledSUnits.end(); I != E; ++I) { + SUnit *SU = &SUnits[*I]; + + scheduleMI(SU, true); + + DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); + } + + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + + placeDebugValues(); + + DEBUG({ + unsigned BBNum = begin()->getParent()->getNumber(); + dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h new file mode 100644 index 000000000000..b270136811c6 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -0,0 +1,489 @@ +//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H + +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +namespace llvm { + +enum SIScheduleCandReason { + NoCand, + RegUsage, + Latency, + Successor, + Depth, + NodeOrder +}; + +struct SISchedulerCandidate { + // The reason for this candidate. + SIScheduleCandReason Reason; + + // Set of reasons that apply to multiple candidates. + uint32_t RepeatReasonSet; + + SISchedulerCandidate() + : Reason(NoCand), RepeatReasonSet(0) {} + + bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } + void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } +}; + +class SIScheduleDAGMI; +class SIScheduleBlockCreator; + +class SIScheduleBlock { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator *BC; + + std::vector<SUnit*> SUnits; + std::map<unsigned, unsigned> NodeNum2Index; + std::vector<SUnit*> TopReadySUs; + std::vector<SUnit*> ScheduledSUnits; + + /// The top of the unscheduled zone. 
+ IntervalPressure TopPressure; + RegPressureTracker TopRPTracker; + + // Pressure: number of said class of registers needed to + // store the live virtual and real registers. + // We do care only of SGPR32 and VGPR32 and do track only virtual registers. + // Pressure of additional registers required inside the block. + std::vector<unsigned> InternalAdditionnalPressure; + // Pressure of input and output registers + std::vector<unsigned> LiveInPressure; + std::vector<unsigned> LiveOutPressure; + // Registers required by the block, and outputs. + // We do track only virtual registers. + // Note that some registers are not 32 bits, + // and thus the pressure is not equal + // to the number of live registers. + std::set<unsigned> LiveInRegs; + std::set<unsigned> LiveOutRegs; + + bool Scheduled; + bool HighLatencyBlock; + + std::vector<unsigned> HasLowLatencyNonWaitedParent; + + // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table. + unsigned ID; + + std::vector<SIScheduleBlock*> Preds; // All blocks predecessors. + std::vector<SIScheduleBlock*> Succs; // All blocks successors. + unsigned NumHighLatencySuccessors; + +public: + SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC, + unsigned ID): + DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(), + TopRPTracker(TopPressure), Scheduled(false), + HighLatencyBlock(false), ID(ID), + Preds(), Succs(), NumHighLatencySuccessors(0) {}; + + ~SIScheduleBlock() {}; + + unsigned getID() const { return ID; } + + /// Functions for Block construction. + void addUnit(SUnit *SU); + + // When all SUs have been added. + void finalizeUnits(); + + // Add block pred, which has instruction predecessor of SU. + void addPred(SIScheduleBlock *Pred); + void addSucc(SIScheduleBlock *Succ); + + const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; } + const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; } + + unsigned Height; // Maximum topdown path length to block without outputs + unsigned Depth; // Maximum bottomup path length to block without inputs + + unsigned getNumHighLatencySuccessors() const { + return NumHighLatencySuccessors; + } + + bool isHighLatencyBlock() { return HighLatencyBlock; } + + // This is approximative. + // Ideally should take into accounts some instructions (rcp, etc) + // are 4 times slower. + int getCost() { return SUnits.size(); } + + // The block Predecessors and Successors must be all registered + // before fastSchedule(). + // Fast schedule with no particular requirement. + void fastSchedule(); + + std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; } + + // Complete schedule that will try to minimize reg pressure and + // low latencies, and will fill liveins and liveouts. + // Needs all MIs to be grouped between BeginBlock and EndBlock. + // The MIs can be moved after the scheduling, + // it is just used to allow correct track of live registers. + void schedule(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); + + bool isScheduled() { return Scheduled; } + + + // Needs the block to be scheduled inside + // TODO: find a way to compute it. + std::vector<unsigned> &getInternalAdditionnalRegUsage() { + return InternalAdditionnalPressure; + } + + std::set<unsigned> &getInRegs() { return LiveInRegs; } + std::set<unsigned> &getOutRegs() { return LiveOutRegs; } + + void printDebug(bool Full); + +private: + struct SISchedCandidate : SISchedulerCandidate { + // The best SUnit candidate. 
+ SUnit *SU; + + unsigned SGPRUsage; + unsigned VGPRUsage; + bool IsLowLatency; + unsigned LowLatencyOffset; + bool HasLowLatencyNonWaitedParent; + + SISchedCandidate() + : SU(nullptr) {} + + bool isValid() const { return SU; } + + // Copy the status of another candidate without changing policy. + void setBest(SISchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + SU = Best.SU; + Reason = Best.Reason; + SGPRUsage = Best.SGPRUsage; + VGPRUsage = Best.VGPRUsage; + IsLowLatency = Best.IsLowLatency; + LowLatencyOffset = Best.LowLatencyOffset; + HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent; + } + }; + + void undoSchedule(); + + void undoReleaseSucc(SUnit *SU, SDep *SuccEdge); + void releaseSucc(SUnit *SU, SDep *SuccEdge); + // InOrOutBlock: restrict to links pointing inside the block (true), + // or restrict to links pointing outside the block (false). + void releaseSuccessors(SUnit *SU, bool InOrOutBlock); + + void nodeScheduled(SUnit *SU); + void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand); + void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand); + SUnit* pickNode(); + void traceCandidate(const SISchedCandidate &Cand); + void initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); +}; + +struct SIScheduleBlocks { + std::vector<SIScheduleBlock*> Blocks; + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; +}; + +enum SISchedulerBlockCreatorVariant { + LatenciesAlone, + LatenciesGrouped, + LatenciesAlonePlusConsecutive +}; + +class SIScheduleBlockCreator { + SIScheduleDAGMI *DAG; + // unique_ptr handles freeing memory for us. + std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs; + std::map<SISchedulerBlockCreatorVariant, + SIScheduleBlocks> Blocks; + std::vector<SIScheduleBlock*> CurrentBlocks; + std::vector<int> Node2CurrentBlock; + + // Topological sort + // Maps topological index to the node number. + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; + std::vector<int> BottomUpIndex2Block; + + // 0 -> Color not given. + // 1 to SUnits.size() -> Reserved group (you should only add elements to them). + // Above -> Other groups. + int NextReservedID; + int NextNonReservedID; + std::vector<int> CurrentColoring; + std::vector<int> CurrentTopDownReservedDependencyColoring; + std::vector<int> CurrentBottomUpReservedDependencyColoring; + +public: + SIScheduleBlockCreator(SIScheduleDAGMI *DAG); + ~SIScheduleBlockCreator(); + + SIScheduleBlocks + getBlocks(SISchedulerBlockCreatorVariant BlockVariant); + + bool isSUInBlock(SUnit *SU, unsigned ID); + +private: + // Give a Reserved color to every high latency. + void colorHighLatenciesAlone(); + + // Create groups of high latencies with a Reserved color. + void colorHighLatenciesGroups(); + + // Compute coloring for topdown and bottom traversals with + // different colors depending on dependencies on Reserved colors. + void colorComputeReservedDependencies(); + + // Give color to all non-colored SUs according to Reserved groups dependencies. + void colorAccordingToReservedDependencies(); + + // Divides Blocks having no bottom up or top down dependencies on Reserved groups. + // The new colors are computed according to the dependencies on the other blocks + // formed with colorAccordingToReservedDependencies. + void colorEndsAccordingToDependencies(); + + // Cut groups into groups with SUs in consecutive order (except for Reserved groups). 
+ void colorForceConsecutiveOrderInGroup(); + + // Merge Constant loads that have all their users into another group to the group. + // (TODO: else if all their users depend on the same group, put them there) + void colorMergeConstantLoadsNextGroup(); + + // Merge SUs that have all their users into another group to the group + void colorMergeIfPossibleNextGroup(); + + // Merge SUs that have all their users into another group to the group, + // but only for Reserved groups. + void colorMergeIfPossibleNextGroupOnlyForReserved(); + + // Merge SUs that have all their users into another group to the group, + // but only if the group is no more than a few SUs. + void colorMergeIfPossibleSmallGroupsToNextGroup(); + + // Divides Blocks with important size. + // Idea of implementation: attribute new colors depending on topdown and + // bottom up links to other blocks. + void cutHugeBlocks(); + + // Put in one group all instructions with no users in this scheduling region + // (we'd want these groups be at the end). + void regroupNoUserInstructions(); + + void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant); + + void topologicalSort(); + + void scheduleInsideBlocks(); + + void fillStats(); +}; + +enum SISchedulerBlockSchedulerVariant { + BlockLatencyRegUsage, + BlockRegUsageLatency, + BlockRegUsage +}; + +class SIScheduleBlockScheduler { + SIScheduleDAGMI *DAG; + SISchedulerBlockSchedulerVariant Variant; + std::vector<SIScheduleBlock*> Blocks; + + std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages; + std::set<unsigned> LiveRegs; + // Num of schedulable unscheduled blocks reading the register. + std::map<unsigned, unsigned> LiveRegsConsumers; + + std::vector<unsigned> LastPosHighLatencyParentScheduled; + int LastPosWaitedHighLatency; + + std::vector<SIScheduleBlock*> BlocksScheduled; + unsigned NumBlockScheduled; + std::vector<SIScheduleBlock*> ReadyBlocks; + + unsigned VregCurrentUsage; + unsigned SregCurrentUsage; + + // Currently is only approximation. + unsigned maxVregUsage; + unsigned maxSregUsage; + + std::vector<unsigned> BlockNumPredsLeft; + std::vector<unsigned> BlockNumSuccsLeft; + +public: + SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, + SISchedulerBlockSchedulerVariant Variant, + SIScheduleBlocks BlocksStruct); + ~SIScheduleBlockScheduler() {}; + + std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }; + + unsigned getVGPRUsage() { return maxVregUsage; }; + unsigned getSGPRUsage() { return maxSregUsage; }; + +private: + struct SIBlockSchedCandidate : SISchedulerCandidate { + // The best Block candidate. + SIScheduleBlock *Block; + + bool IsHighLatency; + int VGPRUsageDiff; + unsigned NumSuccessors; + unsigned NumHighLatencySuccessors; + unsigned LastPosHighLatParentScheduled; + unsigned Height; + + SIBlockSchedCandidate() + : Block(nullptr) {} + + bool isValid() const { return Block; } + + // Copy the status of another candidate without changing policy. 
+ void setBest(SIBlockSchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + Block = Best.Block; + Reason = Best.Reason; + IsHighLatency = Best.IsHighLatency; + VGPRUsageDiff = Best.VGPRUsageDiff; + NumSuccessors = Best.NumSuccessors; + NumHighLatencySuccessors = Best.NumHighLatencySuccessors; + LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled; + Height = Best.Height; + } + }; + + bool tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + SIScheduleBlock *pickBlock(); + + void addLiveRegs(std::set<unsigned> &Regs); + void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs); + void releaseBlockSuccs(SIScheduleBlock *Parent); + void blockScheduled(SIScheduleBlock *Block); + + // Check register pressure change + // by scheduling a block with these LiveIn and LiveOut. + std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs); + + void schedule(); +}; + +struct SIScheduleBlockResult { + std::vector<unsigned> SUs; + unsigned MaxSGPRUsage; + unsigned MaxVGPRUsage; +}; + +class SIScheduler { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator BlockCreator; + +public: + SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}; + + ~SIScheduler() {}; + + struct SIScheduleBlockResult + scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant); +}; + +class SIScheduleDAGMI : public ScheduleDAGMILive { + const SIInstrInfo *SITII; + const SIRegisterInfo *SITRI; + + std::vector<SUnit> SUnitsLinksBackup; + + // For moveLowLatencies. After all Scheduling variants are tested. + std::vector<unsigned> ScheduledSUnits; + std::vector<unsigned> ScheduledSUnitsInv; + + unsigned VGPRSetID; + unsigned SGPRSetID; + +public: + SIScheduleDAGMI(MachineSchedContext *C); + + ~SIScheduleDAGMI() override; + + // Entry point for the schedule. + void schedule() override; + + // To init Block's RPTracker. + void initRPTracker(RegPressureTracker &RPTracker) { + RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); + } + + MachineBasicBlock *getBB() { return BB; } + MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }; + MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }; + LiveIntervals *getLIS() { return LIS; } + MachineRegisterInfo *getMRI() { return &MRI; } + const TargetRegisterInfo *getTRI() { return TRI; } + SUnit& getEntrySU() { return EntrySU; }; + SUnit& getExitSU() { return ExitSU; }; + + void restoreSULinksLeft(); + + template<typename _Iterator> void fillVgprSgprCost(_Iterator First, + _Iterator End, + unsigned &VgprUsage, + unsigned &SgprUsage); + std::set<unsigned> getInRegs() { + std::set<unsigned> InRegs (RPTracker.getPressure().LiveInRegs.begin(), + RPTracker.getPressure().LiveInRegs.end()); + return InRegs; + }; + + unsigned getVGPRSetID() const { return VGPRSetID; } + unsigned getSGPRSetID() const { return SGPRSetID; } + +private: + void topologicalSort(); + // After scheduling is done, improve low latency placements. + void moveLowLatencies(); + +public: + // Some stats for scheduling inside blocks. + std::vector<unsigned> IsLowLatencySU; + std::vector<unsigned> LowLatencyOffset; + std::vector<unsigned> IsHighLatencySU; + // Topological sort + // Maps topological index to the node number. 
+ std::vector<int> TopDownIndex2SU; + std::vector<int> BottomUpIndex2SU; +}; + +} // namespace llvm + +#endif /* SIMACHINESCHEDULER_H_ */ diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 2afa00996609..609f5e7df549 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -23,7 +23,20 @@ using namespace llvm; -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { + unsigned NumRegPressureSets = getNumRegPressureSets(); + + SGPR32SetID = NumRegPressureSets; + VGPR32SetID = NumRegPressureSets; + for (unsigned i = 0; i < NumRegPressureSets; ++i) { + if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0) + SGPR32SetID = i; + else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0) + VGPR32SetID = i; + } + assert(SGPR32SetID < NumRegPressureSets && + VGPR32SetID < NumRegPressureSets); +} void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { MCRegAliasIterator R(Reg, this, true); @@ -36,18 +49,15 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); if (ST.hasSGPRInitBug()) { - unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; - if (ST.isXNACKEnabled()) - BaseIdx -= 4; - + // Leave space for flat_scr, xnack_mask, vcc, and alignment + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // 98/99 need to be reserved for flat_scr or 96/97 for flat_scr and - // 98/99 for xnack_mask, and 100/101 for vcc. This is the next sgpr128 down - // either way. + // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and + // 100/101 for vcc. This is the next sgpr128 down. return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; } @@ -58,25 +68,14 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); if (ST.hasSGPRInitBug()) { - unsigned Idx; - - if (!ST.isXNACKEnabled()) - Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; - else - Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; - + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; return AMDGPU::SGPR_32RegClass.getRegister(Idx); } if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - if (!ST.isXNACKEnabled()) { - // Next register before reservations for flat_scr and vcc. - return AMDGPU::SGPR97; - } else { - // Next register before reservations for flat_scr, xnack_mask, vcc, - // and scratch resource. - return AMDGPU::SGPR91; - } + // Next register before reservations for flat_scr, xnack_mask, vcc, + // and scratch resource. + return AMDGPU::SGPR91; } return AMDGPU::SGPR95; @@ -99,23 +98,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation - // for VCC/FLAT_SCR. + // for VCC/XNACK_MASK/FLAT_SCR. + // + // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose + // SGPRs when the XNACK feature is not used. 
This is currently not done + // because the code that counts SGPRs cannot account for such holes. + reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); - - if (ST.isXNACKEnabled()) - reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); } // Tonga and Iceland can only allocate a fixed number of SGPRs due // to a hw bug. if (ST.hasSGPRInitBug()) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). - unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; - - if (ST.isXNACKEnabled()) - Limit -= 2; + // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs). + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6; for (unsigned i = Limit; i < NumSGPRs; ++i) { unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); @@ -479,12 +477,38 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( if (SubIdx == AMDGPU::NoSubRegister) return RC; - // If this register has a sub-register, we can safely assume it is a 32-bit - // register, because all of SI's sub-registers are 32-bit. + // We can assume that each lane corresponds to one 32-bit register. + unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx)); if (isSGPRClass(RC)) { - return &AMDGPU::SGPR_32RegClass; + switch (Count) { + case 1: + return &AMDGPU::SGPR_32RegClass; + case 2: + return &AMDGPU::SReg_64RegClass; + case 4: + return &AMDGPU::SReg_128RegClass; + case 8: + return &AMDGPU::SReg_256RegClass; + case 16: /* fall-through */ + default: + llvm_unreachable("Invalid sub-register class size"); + } } else { - return &AMDGPU::VGPR_32RegClass; + switch (Count) { + case 1: + return &AMDGPU::VGPR_32RegClass; + case 2: + return &AMDGPU::VReg_64RegClass; + case 3: + return &AMDGPU::VReg_96RegClass; + case 4: + return &AMDGPU::VReg_128RegClass; + case 8: + return &AMDGPU::VReg_256RegClass; + case 16: /* fall-through */ + default: + llvm_unreachable("Invalid sub-register class size"); + } } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 1795237c2140..9410e2049cba 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -25,6 +25,9 @@ namespace llvm { struct SIRegisterInfo : public AMDGPURegisterInfo { private: + unsigned SGPR32SetID; + unsigned VGPR32SetID; + void reserveRegisterTuples(BitVector &, unsigned Reg) const; public: @@ -146,6 +149,9 @@ public: unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const; + unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; + unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; + private: void buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, unsigned Value, diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp index dbdc76b917f3..d36c5d29b127 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -98,6 +98,9 @@ void SITypeRewriter::visitCallInst(CallInst &I) { SmallVector <Type*, 8> Types; bool NeedToReplace = false; Function *F = I.getCalledFunction(); + if (!F) + return; + std::string Name = F->getName(); for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { Value *Arg = I.getArgOperand(i); diff --git 
a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index add415e215cf..3b4c235c0dc9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -106,20 +106,27 @@ bool isReadOnlySegment(const GlobalValue *GV) { return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; } -static const char ShaderTypeAttribute[] = "ShaderType"; - -unsigned getShaderType(const Function &F) { - Attribute A = F.getFnAttribute(ShaderTypeAttribute); - unsigned ShaderType = ShaderType::COMPUTE; +static unsigned getIntegerAttribute(const Function &F, const char *Name, + unsigned Default) { + Attribute A = F.getFnAttribute(Name); + unsigned Result = Default; if (A.isStringAttribute()) { StringRef Str = A.getValueAsString(); - if (Str.getAsInteger(0, ShaderType)) { + if (Str.getAsInteger(0, Result)) { LLVMContext &Ctx = F.getContext(); Ctx.emitError("can't parse shader type"); } } - return ShaderType; + return Result; +} + +unsigned getShaderType(const Function &F) { + return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE); +} + +unsigned getInitialPSInputAddr(const Function &F) { + return getIntegerAttribute(F, "InitialPSInputAddr", 0); } bool isSI(const MCSubtargetInfo &STI) { diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 19419a29f5e0..57cbe1b58f98 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -45,6 +45,8 @@ bool isGlobalSegment(const GlobalValue *GV); bool isReadOnlySegment(const GlobalValue *GV); unsigned getShaderType(const Function &F); +unsigned getInitialPSInputAddr(const Function &F); + bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 419717c85a79..a5207705fc69 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -87,9 +87,22 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } } + if (STI.isTargetDarwin() && F->getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<ARMFunctionInfo>()->isSplitCSR() + ? CSR_iOS_CXX_TLS_PE_SaveList + : CSR_iOS_CXX_TLS_SaveList; return RegList; } +const MCPhysReg *ARMBaseRegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<ARMFunctionInfo>()->isSplitCSR()) + return CSR_iOS_CXX_TLS_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -97,6 +110,8 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (CC == CallingConv::GHC) // This is academic becase all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; + if (STI.isTargetDarwin() && CC == CallingConv::CXX_FAST_TLS) + return CSR_iOS_CXX_TLS_RegMask; return STI.isTargetDarwin() ? 
CSR_iOS_RegMask : CSR_AAPCS_RegMask; } @@ -106,6 +121,14 @@ ARMBaseRegisterInfo::getNoPreservedMask() const { } const uint32_t * +ARMBaseRegisterInfo::getTLSCallPreservedMask(const MachineFunction &MF) const { + assert(MF.getSubtarget<ARMSubtarget>().isTargetDarwin() && + "only know about special TLS call on Darwin"); + return CSR_iOS_TLSCall_RegMask; +} + + +const uint32_t * ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index cea8b80c7821..6a9a45a65687 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -62,6 +62,12 @@ static inline bool isARMArea3Register(unsigned Reg, bool isIOS) { switch (Reg) { case D15: case D14: case D13: case D12: case D11: case D10: case D9: case D8: + case D7: case D6: case D5: case D4: + case D3: case D2: case D1: case D0: + case D31: case D30: case D29: case D28: + case D27: case D26: case D25: case D24: + case D23: case D22: case D21: case D20: + case D19: case D18: case D17: case D16: return true; default: return false; @@ -92,9 +98,12 @@ protected: public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const override; + const uint32_t *getTLSCallPreservedMask(const MachineFunction &MF) const; /// getThisReturnPreservedMask - Returns a call preserved mask specific to the /// case that 'returned' is on an i32 first argument if the calling convention diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td index 233516415149..847ef87c1b26 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td @@ -225,6 +225,21 @@ def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>; def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS_ThisReturn, R9))>; +def CSR_iOS_TLSCall : CalleeSavedRegs<(add LR, SP, + (sequence "R%u", 12, 1), + (sequence "D%u", 31, 0))>; + +// C++ TLS access function saves all registers except SP. Try to match +// the order of CSRs in CSR_iOS. +def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1), + (sequence "D%u", 31, 0))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR)>; + +// CSRs that are handled explicitly via copies. +def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS, LR)>; + // The "interrupt" attribute is used to generate code that is acceptable in // exception-handlers of various kinds. 
It makes us use a different return // instruction (handled elsewhere) and affects which registers we must return to diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp index 9bdf823c85bd..ff2fcfa349dc 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -578,7 +578,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { // For now 32-bit only. - if (VT != MVT::i32) return 0; + if (VT != MVT::i32 || GV->isThreadLocal()) return 0; Reloc::Model RelocM = TM.getRelocationModel(); bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM); @@ -2083,6 +2083,9 @@ bool ARMFastISel::SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector<unsigned, 4> RetRegs; diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 024244092a34..dfbb96959470 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -622,7 +622,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, } if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else Base = N; @@ -801,7 +802,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -1067,7 +1069,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), @@ -1186,7 +1189,8 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, if (N.getOpcode() == ISD::ADD) { return false; // We want to select register offset instead } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else { Base = N; @@ -1292,7 +1296,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, } if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::TargetConstantPool) return false; // We want to select t2LDRpci instead. 
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9cfb06b00c4b..37c0795af283 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -744,7 +744,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBE, MVT::i32, Custom); } - if (!Subtarget->isThumb1Only()) + if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); // ARM does not have ROTL. @@ -1385,6 +1385,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, else return CallingConv::ARM_AAPCS; case CallingConv::Fast: + case CallingConv::CXX_FAST_TLS: if (!Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; @@ -2347,6 +2348,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (ARM::GPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i32)); + else if (ARM::DPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } // Update chain and glue. RetOps[0] = Chain; @@ -2530,6 +2544,72 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); } +/// \brief Convert a TLS address reference into the correct sequence of loads +/// and calls to compute the variable's address for Darwin, and return an +/// SDValue containing the final node. + +/// Darwin only has one TLS scheme which must be capable of dealing with the +/// fully general situation, in the worst case. This means: +/// + "extern __thread" declaration. +/// + Defined in a possibly unknown dynamic library. +/// +/// The general system is that each __thread variable has a [3 x i32] descriptor +/// which contains information used by the runtime to calculate the address. The +/// only part of this the compiler needs to know about is the first word, which +/// contains a function pointer that must be called with the address of the +/// entire descriptor in "r0". +/// +/// Since this descriptor may be in a different unit, in general access must +/// proceed along the usual ARM rules. A common sequence to produce is: +/// +/// movw rT1, :lower16:_var$non_lazy_ptr +/// movt rT1, :upper16:_var$non_lazy_ptr +/// ldr r0, [rT1] +/// ldr rT2, [r0] +/// blx rT2 +/// [...address now in r0...] +SDValue +ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); + SDLoc DL(Op); + + // First step is to get the address of the actua global symbol. This is where + // the TLS descriptor lives. + SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); + + // The first entry in the descriptor is a function pointer that we must call + // to obtain the address of the variable. 
+ SDValue Chain = DAG.getEntryNode(); + SDValue FuncTLVGet = + DAG.getLoad(MVT::i32, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, true, true, 4); + Chain = FuncTLVGet.getValue(1); + + MachineFunction &F = DAG.getMachineFunction(); + MachineFrameInfo *MFI = F.getFrameInfo(); + MFI->setAdjustsStack(true); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be + // silly). + auto TRI = + getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo(); + auto ARI = static_cast<const ARMRegisterInfo *>(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); + + // Finally, we can make the call. This is just a degenerate version of a + // normal AArch64 call node: r0 takes the address of the descriptor, and + // returns the address of the variable in this thread. + Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); + Chain = + DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); +} + // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, @@ -2631,9 +2711,11 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SDValue ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget->isTargetDarwin()) + return LowerGlobalTLSAddressDarwin(Op, DAG); + // TODO: implement the "local dynamic" model - assert(Subtarget->isTargetELF() && - "TLS not implemented for non-ELF targets"); + assert(Subtarget->isTargetELF() && "Only ELF implemented here"); GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); if (DAG.getTarget().Options.EmulatedTLS) return LowerToTLSEmulatedModel(GA, DAG); @@ -11407,7 +11489,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'J': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a constant between -255 and -1, for negated ADD // immediates. This can be used in GCC with an "n" modifier that // prints the negated value, for use with SUB instructions. It is @@ -11476,7 +11558,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'M': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a multiple of 4 between 0 and 1020, for // ADD sp + immediate. if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) @@ -12324,3 +12406,49 @@ unsigned ARMTargetLowering::getExceptionSelectorRegister( // via the personality function. return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; } + +void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in ARMFunctionInfo. 
+ ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void ARMTargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (ARM::GPRRegClass.contains(*I)) + RC = &ARM::GPRRegClass; + else if (ARM::DPRRegClass.contains(*I)) + RC = &ARM::DPRRegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index b764624f1492..96b56c3ec330 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -526,6 +526,8 @@ namespace llvm { SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const; + SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -578,6 +580,15 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index b9de83bfe6dc..c446ba3109e4 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -5398,6 +5398,27 @@ def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), Requires<[IsARM, UseMovt]>; } // isReMaterializable +// The many different faces of TLS access. 
+def : ARMPat<(ARMWrapper tglobaltlsaddr :$dst), + (MOVi32imm tglobaltlsaddr :$dst)>, + Requires<[IsARM, UseMovt]>; + +def : Pat<(ARMWrapper tglobaltlsaddr:$src), + (LDRLIT_ga_abs tglobaltlsaddr:$src)>, + Requires<[IsARM, DontUseMovt]>; + +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (MOV_ga_pcrel tglobaltlsaddr:$addr)>, Requires<[IsARM, UseMovt]>; + +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (LDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsARM, DontUseMovt]>; +let AddedComplexity = 10 in +def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)), + (MOV_ga_pcrel_ldr tglobaltlsaddr:$addr)>, + Requires<[IsARM, UseMovt]>; + + // ConstantPool, GlobalAddress, and JumpTable def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td index 7020ffb41b64..defef4ea9073 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -5689,7 +5689,10 @@ def : NEONInstAlias<"vmov${p} $Vd, $Vm", // VMOV : Vector Move (Immediate) -let isReMaterializable = 1 in { +// Although VMOVs are not strictly speaking cheap, they are as expensive +// as their copies counterpart (VORR), so we should prefer rematerialization +// over splitting when it applies. +let isReMaterializable = 1, isAsCheapAsAMove=1 in { def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd), (ins nImmSplatI8:$SIMM), IIC_VMOVImm, "vmov", "i8", "$Vd, $SIMM", "", @@ -5744,7 +5747,7 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd), (ins nImmVMOVF32:$SIMM), IIC_VMOVImm, "vmov", "f32", "$Vd, $SIMM", "", [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>; -} // isReMaterializable +} // isReMaterializable, isAsCheapAsAMove // Add support for bytes replication feature, so it could be GAS compatible. // E.g. 
instructions below: diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td index df6f24306354..5b1f9a06442e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -1366,6 +1366,14 @@ def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src), (ARMWrapper tglobaladdr:$src))]>, Requires<[IsThumb, DontUseMovt]>; +// TLS globals +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (tLDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsThumb, DontUseMovt]>; +def : Pat<(ARMWrapper tglobaltlsaddr:$addr), + (tLDRLIT_ga_abs tglobaltlsaddr:$addr)>, + Requires<[IsThumb, DontUseMovt]>; + // JumpTable def : T1Pat<(ARMWrapperJT tjumptable:$dst), diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index d460d33fa0a3..f42f4569b2f8 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -3875,6 +3875,13 @@ def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), } +def : T2Pat<(ARMWrapperPIC tglobaltlsaddr :$dst), + (t2MOV_ga_pcrel tglobaltlsaddr:$dst)>, + Requires<[IsThumb2, UseMovt]>; +def : T2Pat<(ARMWrapper tglobaltlsaddr:$dst), + (t2MOVi32imm tglobaltlsaddr:$dst)>, + Requires<[IsThumb2, UseMovt]>; + // ConstantPool, GlobalAddress, and JumpTable def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td index 050cd1a445ad..63e7940bb14e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -930,10 +930,10 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011, // and could enable the conversion to float to be removed completely. def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), (VMOVDRR GPR:$Rl, (BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, - Requires<[IsARM]>; + Requires<[IsARM, HasV6T2]>; def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), (VMOVDRR GPR:$Rl, (t2BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, - Requires<[IsThumb2]>; + Requires<[IsThumb2, HasV6T2]>; def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), (VMOVDRR GPR:$Rl, (EORri GPR:$Rh, (i32 0x80000000)))>, Requires<[IsARM]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp index ac0330fbcb34..71ad7a4a732a 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -20,4 +20,5 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) RestoreSPFromFP(false), LRSpilledForFarJump(false), FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), - PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false) {} + PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), + IsSplitCSR(false) {} diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index d6447978ef2c..68f9aec8cae5 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -118,6 +118,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// coalesced weights. 
DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR; + public: ARMFunctionInfo() : isThumb(false), @@ -128,7 +132,7 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0), NumAlignedDPRCS2Regs(0), PICLabelUId(0), - VarArgsFrameIndex(0), HasITBlocks(false) {} + VarArgsFrameIndex(0), HasITBlocks(false), IsSplitCSR(false) {} explicit ARMFunctionInfo(MachineFunction &MF); @@ -199,6 +203,9 @@ public: bool hasITBlocks() const { return HasITBlocks; } void setHasITBlocks(bool h) { HasITBlocks = h; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) llvm_unreachable("Duplicate entries!"); diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 6084f22c8470..57577dc834b7 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -388,6 +388,9 @@ private: size_t calculateContentSize() const; + // Reset state between object emissions + void reset() override; + public: ARMTargetELFStreamer(MCStreamer &S) : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::FK_INVALID), @@ -415,7 +418,7 @@ public: MCCodeEmitter *Emitter, bool IsThumb) : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) { - Reset(); + EHReset(); } ~ARMELFStreamer() {} @@ -579,7 +582,10 @@ private: } // Helper functions for ARM exception handling directives - void Reset(); + void EHReset(); + + // Reset state between object emissions + void reset() override; void EmitPersonalityFixup(StringRef Name); void FlushPendingOffset(); @@ -1040,6 +1046,8 @@ void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { getStreamer().emitInst(Inst, Suffix); } +void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; } + void ARMELFStreamer::FinishImpl() { MCTargetStreamer &TS = *getTargetStreamer(); ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); @@ -1048,6 +1056,18 @@ void ARMELFStreamer::FinishImpl() { MCELFStreamer::FinishImpl(); } +void ARMELFStreamer::reset() { + MCTargetStreamer &TS = *getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + ATS.reset(); + MappingSymbolCounter = 0; + MCELFStreamer::reset(); + // MCELFStreamer clear's the assembler's e_flags. 
However, for + // arm we manually set the ABI version on streamer creation, so + // do the same here + getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5); +} + inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags, @@ -1094,7 +1114,7 @@ void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { Kind)); } -void ARMELFStreamer::Reset() { +void ARMELFStreamer::EHReset() { ExTab = nullptr; FnStart = nullptr; Personality = nullptr; @@ -1164,7 +1184,7 @@ void ARMELFStreamer::emitFnEnd() { SwitchSection(&FnStart->getSection()); // Clean exception handling frame information - Reset(); + EHReset(); } void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index dad50f2834ee..c0d10c896354 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -38,6 +38,9 @@ void ARMTargetStreamer::emitCurrentConstantPool() { // finish() - write out any non-empty assembler constant pools. void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); } +// reset() - Reset any state +void ARMTargetStreamer::reset() {} + // The remaining callbacks should be handled separately by each // streamer. void ARMTargetStreamer::emitFnStart() {} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h index 812f9830824d..27faac63683a 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h @@ -53,6 +53,11 @@ public: /// \p MBB will be correctly handled by the target. bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; + /// Disable shrink wrap as tBfar/BL will be used to adjust for long jumps. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return false; + } + private: /// Check if the frame lowering of \p MF needs a special fixup /// code sequence for the epilogue. diff --git a/contrib/llvm/lib/Target/AVR/AVR.h b/contrib/llvm/lib/Target/AVR/AVR.h new file mode 100644 index 000000000000..4c1667ed341c --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVR.h @@ -0,0 +1,54 @@ +//===-- AVR.h - Top-level interface for AVR representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// AVR back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_H +#define LLVM_AVR_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" + +namespace llvm { + +class AVRTargetMachine; +class FunctionPass; + +FunctionPass *createAVRISelDag(AVRTargetMachine &TM, + CodeGenOpt::Level OptLevel); +FunctionPass *createAVRExpandPseudoPass(); +FunctionPass *createAVRFrameAnalyzerPass(); +FunctionPass *createAVRDynAllocaSRPass(); +FunctionPass *createAVRBranchSelectionPass(); + +/** + * Contains the AVR backend. 
+ */ +namespace AVR { + +enum AddressSpace { DataMemory, ProgramMemory }; + +template <typename T> bool isProgramMemoryAddress(T *V) { + return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory; +} + +inline bool isProgramMemoryAccess(MemSDNode const *N) { + auto V = N->getMemOperand()->getValue(); + + return (V != nullptr) ? isProgramMemoryAddress(V) : false; +} + +} // end of namespace AVR + +} // end namespace llvm + +#endif // LLVM_AVR_H diff --git a/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h new file mode 100644 index 000000000000..ee832ad1bc75 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h @@ -0,0 +1,29 @@ +//===-- AVRSelectionDAGInfo.h - AVR SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AVR subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_SELECTION_DAG_INFO_H +#define LLVM_AVR_SELECTION_DAG_INFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { +/** + * Holds information about the AVR instruction selection DAG. + */ +class AVRSelectionDAGInfo : public TargetSelectionDAGInfo { +public: +}; + +} // end namespace llvm + +#endif // LLVM_AVR_SELECTION_DAG_INFO_H diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp new file mode 100644 index 000000000000..85f03e818e83 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp @@ -0,0 +1,40 @@ +//===-- AVRTargetObjectFile.cpp - AVR Object Files ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AVRTargetObjectFile.h" + +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +#include "AVR.h" + +namespace llvm { +void AVRTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { + Base::Initialize(Ctx, TM); + ProgmemDataSection = + Ctx.getELFSection(".progmem.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); +} + +MCSection * +AVRTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, Mangler &Mang, + const TargetMachine &TM) const { + // Global values in flash memory are placed in the progmem.data section + // unless they already have a user assigned section. + if (AVR::isProgramMemoryAddress(GV) && !GV->hasSection()) + return ProgmemDataSection; + + // Otherwise, we work the same way as ELF. 
+ return Base::SelectSectionForGlobal(GV, Kind, Mang, TM); +} +} // end of namespace llvm diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h new file mode 100644 index 000000000000..bdda35b34993 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h @@ -0,0 +1,35 @@ +//===-- AVRTargetObjectFile.h - AVR Object Info -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_TARGET_OBJECT_FILE_H +#define LLVM_AVR_TARGET_OBJECT_FILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +namespace llvm { +/** + * Lowering for an AVR ELF32 object file. + */ +class AVRTargetObjectFile : public TargetLoweringObjectFileELF { + typedef TargetLoweringObjectFileELF Base; + +public: + void Initialize(MCContext &ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; + +private: + MCSection *ProgmemDataSection; +}; + +} // end namespace llvm + +#endif // LLVM_AVR_TARGET_OBJECT_FILE_H diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index e213089687e8..4c7c0392a132 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -190,9 +190,9 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } -MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI, - MCStreamer &OutStreamer, - const MCOperand &Imm, int AlignSize) { +static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI, + MCStreamer &OutStreamer, const MCOperand &Imm, + int AlignSize) { MCSymbol *Sym; int64_t Value; if (Imm.getExpr()->evaluateAsAbsolute(Value)) { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 77907b054d54..4d2b54521e83 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1275,6 +1275,8 @@ bool RedundantInstrElimination::processBlock(MachineBasicBlock &B, if (!BT.has(RD.Reg)) continue; const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); // Find a source operand that is equal to the result. for (auto &Op : MI->uses()) { @@ -1298,7 +1300,7 @@ bool RedundantInstrElimination::processBlock(MachineBasicBlock &B, DebugLoc DL = MI->getDebugLoc(); const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); unsigned NewR = MRI.createVirtualRegister(FRC); - BuildMI(B, I, DL, HII.get(TargetOpcode::COPY), NewR) + BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) .addReg(RS.Reg, 0, RS.Sub); HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); BT.put(BitTracker::RegisterRef(NewR), SC); @@ -1925,7 +1927,9 @@ bool BitSimplification::genPackhl(MachineInstr *MI, MachineBasicBlock &B = *MI->getParent(); unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass); DebugLoc DL = MI->getDebugLoc(); - BuildMI(B, MI, DL, HII.get(Hexagon::S2_packhl), NewR) + auto At = MI->isPHI() ? 
B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + BuildMI(B, At, DL, HII.get(Hexagon::S2_packhl), NewR) .addReg(Rs.Reg, 0, Rs.Sub) .addReg(Rt.Reg, 0, Rt.Sub); HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); @@ -1950,9 +1954,11 @@ bool BitSimplification::genExtractHalf(MachineInstr *MI, // Prefer zxth, since zxth can go in any slot, while extractu only in // slots 2 and 3. unsigned NewR = 0; + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); if (L.Low && Opc != Hexagon::A2_zxth) { NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - BuildMI(B, MI, DL, HII.get(Hexagon::A2_zxth), NewR) + BuildMI(B, At, DL, HII.get(Hexagon::A2_zxth), NewR) .addReg(L.Reg, 0, L.Sub); } else if (!L.Low && Opc != Hexagon::S2_extractu) { NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); @@ -1989,7 +1995,9 @@ bool BitSimplification::genCombineHalf(MachineInstr *MI, MachineBasicBlock &B = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - BuildMI(B, MI, DL, HII.get(COpc), NewR) + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + BuildMI(B, At, DL, HII.get(COpc), NewR) .addReg(H.Reg, 0, H.Sub) .addReg(L.Reg, 0, L.Sub); HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); @@ -2043,7 +2051,9 @@ bool BitSimplification::genExtractLow(MachineInstr *MI, continue; unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - auto MIB = BuildMI(B, MI, DL, HII.get(NewOpc), NewR) + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + auto MIB = BuildMI(B, At, DL, HII.get(NewOpc), NewR) .addReg(RS.Reg, 0, RS.Sub); if (NewOpc == Hexagon::A2_andir) MIB.addImm((1 << W) - 1); @@ -2076,6 +2086,8 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI, if (!BT.has(RS.Reg) || !HBS::getSubregMask(RS, F, W, MRI)) return false; MachineBasicBlock &B = *MI->getParent(); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); const BitTracker::BitValue &V = SC[F+BN]; @@ -2098,7 +2110,7 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI, } if (P != UINT_MAX) { unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); - BuildMI(B, MI, DL, HII.get(Hexagon::S2_tstbit_i), NewR) + BuildMI(B, At, DL, HII.get(Hexagon::S2_tstbit_i), NewR) .addReg(RR.Reg, 0, RR.Sub) .addImm(P); HBS::replaceReg(RD.Reg, NewR, MRI); @@ -2108,7 +2120,7 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI, } else if (V.is(0) || V.is(1)) { unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); unsigned NewOpc = V.is(0) ? Hexagon::TFR_PdFalse : Hexagon::TFR_PdTrue; - BuildMI(B, MI, DL, HII.get(NewOpc), NewR); + BuildMI(B, At, DL, HII.get(NewOpc), NewR); HBS::replaceReg(RD.Reg, NewR, MRI); return true; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td index 87d6b359f5fb..37c2042a2ccd 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -3320,6 +3320,7 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, /* u16_0Imm */ addr{15-0}))); // Store upper-half and store doubleword cannot be NV. 
let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let Uses = !if (isAbs, [], [GP]); let IClass = 0b0100; let Inst{27} = 1; @@ -3425,6 +3426,7 @@ class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp, bit isAbs> !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2}, !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1}, /* u16_0Imm */ addr{15-0}))); + let Uses = !if (isAbs, [], [GP]); let IClass = 0b0100; let Inst{27} = 1; @@ -3736,7 +3738,7 @@ defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>; // if ([!]Pv[.new]) Rx=mem[bhwd](##global) //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in +let isAsmParserOnly = 1, Uses = [GP] in class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp, bits<3> MajOp> : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp new file mode 100644 index 000000000000..06719cddf4b6 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp @@ -0,0 +1,60 @@ +//===--- HexagonRDF.cpp ---------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "HexagonRDF.h" +#include "HexagonInstrInfo.h" +#include "HexagonRegisterInfo.h" + +#include "llvm/CodeGen/MachineInstr.h" + +using namespace llvm; +using namespace rdf; + +bool HexagonRegisterAliasInfo::covers(RegisterRef RA, RegisterRef RB) const { + if (RA == RB) + return true; + + if (TargetRegisterInfo::isVirtualRegister(RA.Reg) && + TargetRegisterInfo::isVirtualRegister(RB.Reg)) { + // Hexagon-specific cases. + if (RA.Reg == RB.Reg) { + if (RA.Sub == 0) + return true; + if (RB.Sub == 0) + return false; + } + } + + return RegisterAliasInfo::covers(RA, RB); +} + +bool HexagonRegisterAliasInfo::covers(const RegisterSet &RRs, RegisterRef RR) + const { + if (RRs.count(RR)) + return true; + + if (!TargetRegisterInfo::isPhysicalRegister(RR.Reg)) { + assert(TargetRegisterInfo::isVirtualRegister(RR.Reg)); + // Check if both covering subregisters are present. + bool HasLo = RRs.count({RR.Reg, Hexagon::subreg_loreg}); + bool HasHi = RRs.count({RR.Reg, Hexagon::subreg_hireg}); + if (HasLo && HasHi) + return true; + } + + if (RR.Sub == 0) { + // Check if both covering subregisters are present. + unsigned Lo = TRI.getSubReg(RR.Reg, Hexagon::subreg_loreg); + unsigned Hi = TRI.getSubReg(RR.Reg, Hexagon::subreg_hireg); + if (RRs.count({Lo, 0}) && RRs.count({Hi, 0})) + return true; + } + + return RegisterAliasInfo::covers(RRs, RR); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h new file mode 100644 index 000000000000..00c1889e8eb5 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h @@ -0,0 +1,28 @@ +//===--- HexagonRDF.h -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGON_RDF_H +#define HEXAGON_RDF_H +#include "RDFGraph.h" + +namespace llvm { + class TargetRegisterInfo; +} + +namespace rdf { + struct HexagonRegisterAliasInfo : public RegisterAliasInfo { + HexagonRegisterAliasInfo(const TargetRegisterInfo &TRI) + : RegisterAliasInfo(TRI) {} + bool covers(RegisterRef RA, RegisterRef RR) const override; + bool covers(const RegisterSet &RRs, RegisterRef RR) const override; + }; +} + +#endif + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp new file mode 100644 index 000000000000..3fcda984d265 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -0,0 +1,272 @@ +//===--- HexagonRDFOpt.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "HexagonInstrInfo.h" +#include "HexagonRDF.h" +#include "HexagonSubtarget.h" +#include "RDFCopy.h" +#include "RDFDeadCode.h" +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +namespace llvm { + void initializeHexagonRDFOptPass(PassRegistry&); + FunctionPass *createHexagonRDFOpt(); +} + +namespace { + cl::opt<unsigned> RDFLimit("rdf-limit", cl::init(UINT_MAX)); + unsigned RDFCount = 0; + cl::opt<bool> RDFDump("rdf-dump", cl::init(false)); + + class HexagonRDFOpt : public MachineFunctionPass { + public: + HexagonRDFOpt() : MachineFunctionPass(ID) { + initializeHexagonRDFOptPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominanceFrontier>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + const char *getPassName() const override { + return "Hexagon RDF optimizations"; + } + bool runOnMachineFunction(MachineFunction &MF) override; + + static char ID; + + private: + MachineDominatorTree *MDT; + MachineRegisterInfo *MRI; + }; + + char HexagonRDFOpt::ID = 0; +} + +INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) +INITIALIZE_PASS_END(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false) + + +struct HexagonDCE : public DeadCodeElimination { + HexagonDCE(DataFlowGraph &G, MachineRegisterInfo &MRI) + : DeadCodeElimination(G, MRI) {} + bool rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove); + void removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum); + + bool run(); +}; + + +bool HexagonDCE::run() { + bool Collected = collect(); + if (!Collected) + return false; + + const SetVector<NodeId> &DeadNodes = getDeadNodes(); + const SetVector<NodeId> &DeadInstrs = getDeadInstrs(); + + typedef 
DenseMap<NodeId,NodeId> RefToInstrMap; + RefToInstrMap R2I; + SetVector<NodeId> PartlyDead; + DataFlowGraph &DFG = getDFG(); + + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + for (auto TA : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Stmt>, DFG)) { + NodeAddr<StmtNode*> SA = TA; + for (NodeAddr<RefNode*> RA : SA.Addr->members(DFG)) { + R2I.insert(std::make_pair(RA.Id, SA.Id)); + if (DFG.IsDef(RA) && DeadNodes.count(RA.Id)) + if (!DeadInstrs.count(SA.Id)) + PartlyDead.insert(SA.Id); + } + } + } + + // Nodes to remove. + SetVector<NodeId> Remove = DeadInstrs; + + bool Changed = false; + for (NodeId N : PartlyDead) { + auto SA = DFG.addr<StmtNode*>(N); + if (trace()) + dbgs() << "Partly dead: " << *SA.Addr->getCode(); + Changed |= rewrite(SA, Remove); + } + + return erase(Remove) || Changed; +} + + +void HexagonDCE::removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum) { + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + + auto getOpNum = [MI] (MachineOperand &Op) -> unsigned { + for (unsigned i = 0, n = MI->getNumOperands(); i != n; ++i) + if (&MI->getOperand(i) == &Op) + return i; + llvm_unreachable("Invalid operand"); + }; + DenseMap<NodeId,unsigned> OpMap; + NodeList Refs = IA.Addr->members(getDFG()); + for (NodeAddr<RefNode*> RA : Refs) + OpMap.insert(std::make_pair(RA.Id, getOpNum(RA.Addr->getOp()))); + + MI->RemoveOperand(OpNum); + + for (NodeAddr<RefNode*> RA : Refs) { + unsigned N = OpMap[RA.Id]; + if (N < OpNum) + RA.Addr->setRegRef(&MI->getOperand(N)); + else if (N > OpNum) + RA.Addr->setRegRef(&MI->getOperand(N-1)); + } +} + + +bool HexagonDCE::rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove) { + if (!getDFG().IsCode<NodeAttrs::Stmt>(IA)) + return false; + DataFlowGraph &DFG = getDFG(); + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + auto &HII = static_cast<const HexagonInstrInfo&>(DFG.getTII()); + if (HII.getAddrMode(MI) != HexagonII::PostInc) + return false; + unsigned Opc = MI->getOpcode(); + unsigned OpNum, NewOpc; + switch (Opc) { + case Hexagon::L2_loadri_pi: + NewOpc = Hexagon::L2_loadri_io; + OpNum = 1; + break; + case Hexagon::L2_loadrd_pi: + NewOpc = Hexagon::L2_loadrd_io; + OpNum = 1; + break; + case Hexagon::V6_vL32b_pi: + NewOpc = Hexagon::V6_vL32b_ai; + OpNum = 1; + break; + case Hexagon::S2_storeri_pi: + NewOpc = Hexagon::S2_storeri_io; + OpNum = 0; + break; + case Hexagon::S2_storerd_pi: + NewOpc = Hexagon::S2_storerd_io; + OpNum = 0; + break; + case Hexagon::V6_vS32b_pi: + NewOpc = Hexagon::V6_vS32b_ai; + OpNum = 0; + break; + default: + return false; + } + auto IsDead = [this] (NodeAddr<DefNode*> DA) -> bool { + return getDeadNodes().count(DA.Id); + }; + NodeList Defs; + MachineOperand &Op = MI->getOperand(OpNum); + for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) { + if (&DA.Addr->getOp() != &Op) + continue; + Defs = DFG.getRelatedRefs(IA, DA); + if (!std::all_of(Defs.begin(), Defs.end(), IsDead)) + return false; + break; + } + + // Mark all nodes in Defs for removal. 
+ for (auto D : Defs) + Remove.insert(D.Id); + + if (trace()) + dbgs() << "Rewriting: " << *MI; + MI->setDesc(HII.get(NewOpc)); + MI->getOperand(OpNum+2).setImm(0); + removeOperand(IA, OpNum); + if (trace()) + dbgs() << " to: " << *MI; + + return true; +} + + +bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) { + if (RDFLimit.getPosition()) { + if (RDFCount >= RDFLimit) + return false; + RDFCount++; + } + + MDT = &getAnalysis<MachineDominatorTree>(); + const auto &MDF = getAnalysis<MachineDominanceFrontier>(); + const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + MRI = &MF.getRegInfo(); + + HexagonRegisterAliasInfo HAI(HRI); + TargetOperandInfo TOI(HII); + + if (RDFDump) + MF.print(dbgs() << "Before " << getPassName() << "\n", nullptr); + DataFlowGraph G(MF, HII, HRI, *MDT, MDF, HAI, TOI); + G.build(); + if (RDFDump) { + dbgs() << PrintNode<FuncNode*>(G.getFunc(), G) << '\n'; + dbgs() << MF.getName() << '\n'; + } + + bool Changed; + CopyPropagation CP(G); + CP.trace(RDFDump); + Changed = CP.run(); + if (Changed) + G.build(); + + HexagonDCE DCE(G, *MRI); + DCE.trace(RDFDump); + Changed |= DCE.run(); + + if (Changed) { + Liveness LV(*MRI, G); + LV.trace(RDFDump); + LV.computeLiveIns(); + LV.resetLiveIns(); + LV.resetKills(); + } + + if (RDFDump) + MF.print(dbgs() << "After " << getPassName() << "\n", nullptr); + return false; +} + + +FunctionPass *llvm::createHexagonRDFOpt() { + return new HexagonRDFOpt(); +} + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 61c0589fb5bf..6e5f7324aca8 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -103,6 +103,8 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF) Reserved.set(Hexagon::R30); Reserved.set(Hexagon::R31); Reserved.set(Hexagon::PC); + Reserved.set(Hexagon::GP); + Reserved.set(Hexagon::D14); Reserved.set(Hexagon::D15); Reserved.set(Hexagon::LC0); Reserved.set(Hexagon::LC1); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 9dccd696c989..34b03fb74cef 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -26,7 +26,11 @@ using namespace llvm; -static cl:: opt<bool> DisableHardwareLoops("disable-hexagon-hwloops", + +static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore, + cl::init(true), cl::desc("Enable RDF-based optimizations")); + +static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt", @@ -111,6 +115,7 @@ namespace llvm { FunctionPass *createHexagonOptimizeSZextends(); FunctionPass *createHexagonPacketizer(); FunctionPass *createHexagonPeephole(); + FunctionPass *createHexagonRDFOpt(); FunctionPass *createHexagonSplitConst32AndConst64(); FunctionPass *createHexagonSplitDoubleRegs(); FunctionPass *createHexagonStoreWidening(); @@ -262,9 +267,12 @@ void HexagonPassConfig::addPreRegAlloc() { } void HexagonPassConfig::addPostRegAlloc() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { + if (EnableRDFOpt) + addPass(createHexagonRDFOpt()); if (!DisableHexagonCFGOpt) 
addPass(createHexagonCFGOptimizer(), false); + } } void HexagonPassConfig::addPreSched2() { diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index c2c6275e7e8d..4b07ca7490a8 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -334,21 +334,21 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI, // The only relocs left should be GP relative: default: if (MCID.mayStore() || MCID.mayLoad()) { - for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; - ++ImpUses) { - if (*ImpUses == Hexagon::GP) { - switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { - case HexagonII::MemAccessSize::ByteAccess: - return fixup_Hexagon_GPREL16_0; - case HexagonII::MemAccessSize::HalfWordAccess: - return fixup_Hexagon_GPREL16_1; - case HexagonII::MemAccessSize::WordAccess: - return fixup_Hexagon_GPREL16_2; - case HexagonII::MemAccessSize::DoubleWordAccess: - return fixup_Hexagon_GPREL16_3; - default: - llvm_unreachable("unhandled fixup"); - } + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); + ImpUses && *ImpUses; ++ImpUses) { + if (*ImpUses != Hexagon::GP) + continue; + switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { + case HexagonII::MemAccessSize::ByteAccess: + return fixup_Hexagon_GPREL16_0; + case HexagonII::MemAccessSize::HalfWordAccess: + return fixup_Hexagon_GPREL16_1; + case HexagonII::MemAccessSize::WordAccess: + return fixup_Hexagon_GPREL16_2; + case HexagonII::MemAccessSize::DoubleWordAccess: + return fixup_Hexagon_GPREL16_3; + default: + llvm_unreachable("unhandled fixup"); } } } else diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 6ceb848ba20c..4e1cce3bd7d1 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -95,14 +95,7 @@ unsigned HexagonResource::setWeight(unsigned s) { return (Weight); } -HexagonCVIResource::TypeUnitsAndLanes *HexagonCVIResource::TUL; - -bool HexagonCVIResource::SetUp = HexagonCVIResource::setup(); - -bool HexagonCVIResource::setup() { - assert(!TUL); - TUL = new (TypeUnitsAndLanes); - +void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) { (*TUL)[HexagonII::TypeCVI_VA] = UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2); @@ -123,13 +116,12 @@ bool HexagonCVIResource::setup() { (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0); (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1); (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4); - - return true; } -HexagonCVIResource::HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, +HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL, + MCInstrInfo const &MCII, unsigned s, MCInst const *id) - : HexagonResource(s) { + : HexagonResource(s), TUL(TUL) { unsigned T = HexagonMCInstrInfo::getType(MCII, *id); if (TUL->count(T)) { @@ -153,6 +145,7 @@ HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI) : MCII(MCII), STI(STI) { reset(); + HexagonCVIResource::SetupTUL(&TUL, STI.getCPU()); } void HexagonShuffler::reset() { @@ -163,7 +156,7 @@ void HexagonShuffler::reset() { void 
HexagonShuffler::append(MCInst const *ID, MCInst const *Extender, unsigned S, bool X) { - HexagonInstr PI(MCII, ID, Extender, S, X); + HexagonInstr PI(&TUL, MCII, ID, Extender, S, X); Packet.push_back(PI); } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 174f10fb2580..a093f8545132 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" using namespace llvm; @@ -53,9 +54,11 @@ public: // HVX insn resources. class HexagonCVIResource : public HexagonResource { +public: typedef std::pair<unsigned, unsigned> UnitsAndLanes; typedef llvm::DenseMap<unsigned, UnitsAndLanes> TypeUnitsAndLanes; +private: // Available HVX slots. enum { CVI_NONE = 0, @@ -65,9 +68,7 @@ class HexagonCVIResource : public HexagonResource { CVI_MPY1 = 1 << 3 }; - static bool SetUp; - static bool setup(); - static TypeUnitsAndLanes *TUL; + TypeUnitsAndLanes *TUL; // Count of adjacent slots that the insn requires to be executed. unsigned Lanes; @@ -81,7 +82,9 @@ class HexagonCVIResource : public HexagonResource { void setStore(bool f = true) { Store = f; }; public: - HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, MCInst const *id); + HexagonCVIResource(TypeUnitsAndLanes *TUL, MCInstrInfo const &MCII, + unsigned s, MCInst const *id); + static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU); bool isValid() const { return (Valid); }; unsigned getLanes() const { return (Lanes); }; @@ -100,10 +103,11 @@ class HexagonInstr { bool SoloException; public: - HexagonInstr(MCInstrInfo const &MCII, MCInst const *id, + HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T, + MCInstrInfo const &MCII, MCInst const *id, MCInst const *Extender, unsigned s, bool x = false) - : ID(id), Extender(Extender), Core(s), CVI(MCII, s, id), - SoloException(x){}; + : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id), + SoloException(x) {}; MCInst const *getDesc() const { return (ID); }; @@ -136,6 +140,8 @@ class HexagonShuffler { // Shuffling error code. unsigned Error; + HexagonCVIResource::TypeUnitsAndLanes TUL; + protected: int64_t BundleFlags; MCInstrInfo const &MCII; diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp new file mode 100644 index 000000000000..c547c7195075 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -0,0 +1,180 @@ +//===--- RDFCopy.cpp ------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Simplistic RDF-based copy propagation. 
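The core idea of this pass, stated independently of the RDF machinery: for a copy y = COPY x, a later use of y may be rewritten to use x only if the definition of x that reached the copy is still the reaching definition at that use (this is what the RDefMap lookups further down check). Below is a minimal, self-contained sketch of that test on a toy three-address program; the Instr struct and all names are illustrative only, not part of the patch.

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Toy IR: each instruction defines one name from (at most) one used name.
struct Instr { std::string Def, Use; bool IsCopy; };

int main() {
  std::vector<Instr> Prog = {
    {"x", "",  false},   // d1: x = ...
    {"y", "x", true},    // c1: y = COPY x
    {"z", "y", false},   // u1: use of y -> may be rewritten to x (x still has def d1)
    {"x", "",  false},   // d2: x = ...  (the copy source is redefined)
    {"w", "y", false},   // u2: use of y -> must not be rewritten (reaching def of x changed)
  };

  std::map<std::string, int> CurDef;                          // name -> id of reaching def
  std::map<std::string, std::pair<std::string, int>> CopyOf;  // copy dst -> (src, src def id at copy)
  int DefId = 0;

  for (const Instr &I : Prog) {
    if (!I.Use.empty() && CopyOf.count(I.Use)) {
      const auto &C = CopyOf[I.Use];
      bool Ok = (CurDef[C.first] == C.second);  // same reaching def of the copy source?
      std::printf("use of %s: %s be replaced by %s\n", I.Use.c_str(),
                  Ok ? "can" : "cannot", C.first.c_str());
    }
    CurDef[I.Def] = ++DefId;
    if (I.IsCopy)
      CopyOf[I.Def] = {I.Use, CurDef[I.Use]};
    else
      CopyOf.erase(I.Def);
  }
  return 0;
}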
+ +#include "RDFCopy.h" +#include "RDFGraph.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/CommandLine.h" + +#include <atomic> + +#ifndef NDEBUG +static cl::opt<unsigned> CpLimit("rdf-cp-limit", cl::init(0), cl::Hidden); +static unsigned CpCount = 0; +#endif + +using namespace llvm; +using namespace rdf; + +void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI) { + assert(MI->getOpcode() == TargetOpcode::COPY); + const MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1); + RegisterRef DstR = { Op0.getReg(), Op0.getSubReg() }; + RegisterRef SrcR = { Op1.getReg(), Op1.getSubReg() }; + auto FS = DefM.find(SrcR); + if (FS == DefM.end() || FS->second.empty()) + return; + Copies.push_back(SA.Id); + RDefMap[SrcR][SA.Id] = FS->second.top()->Id; + // Insert DstR into the map. + RDefMap[DstR]; +} + + +void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) { + RegisterSet RRs; + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + RRs.insert(RA.Addr->getRegRef()); + bool Common = false; + for (auto &R : RDefMap) { + if (!RRs.count(R.first)) + continue; + Common = true; + break; + } + if (!Common) + return; + + for (auto &R : RDefMap) { + if (!RRs.count(R.first)) + continue; + auto F = DefM.find(R.first); + if (F == DefM.end() || F->second.empty()) + continue; + R.second[IA.Id] = F->second.top()->Id; + } +} + + +bool CopyPropagation::scanBlock(MachineBasicBlock *B) { + bool Changed = false; + auto BA = DFG.getFunc().Addr->findBlock(B, DFG); + DFG.markBlock(BA.Id, DefM); + + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + if (DFG.IsCode<NodeAttrs::Stmt>(IA)) { + NodeAddr<StmtNode*> SA = IA; + MachineInstr *MI = SA.Addr->getCode(); + if (MI->isCopy()) + recordCopy(SA, MI); + } + + updateMap(IA); + DFG.pushDefs(IA, DefM); + } + + MachineDomTreeNode *N = MDT.getNode(B); + for (auto I : *N) + Changed |= scanBlock(I->getBlock()); + + DFG.releaseBlock(BA.Id, DefM); + return Changed; +} + + +bool CopyPropagation::run() { + scanBlock(&DFG.getMF().front()); + + if (trace()) { + dbgs() << "Copies:\n"; + for (auto I : Copies) + dbgs() << *DFG.addr<StmtNode*>(I).Addr->getCode(); + dbgs() << "\nRDef map:\n"; + for (auto R : RDefMap) { + dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {"; + for (auto &M : R.second) + dbgs() << ' ' << Print<NodeId>(M.first, DFG) << ':' + << Print<NodeId>(M.second, DFG); + dbgs() << " }\n"; + } + } + + bool Changed = false; + NodeSet Deleted; +#ifndef NDEBUG + bool HasLimit = CpLimit.getNumOccurrences() > 0; +#endif + + for (auto I : Copies) { +#ifndef NDEBUG + if (HasLimit && CpCount >= CpLimit) + break; +#endif + if (Deleted.count(I)) + continue; + auto SA = DFG.addr<InstrNode*>(I); + NodeList Ds = SA.Addr->members_if(DFG.IsDef, DFG); + if (Ds.size() != 1) + continue; + NodeAddr<DefNode*> DA = Ds[0]; + RegisterRef DR0 = DA.Addr->getRegRef(); + NodeList Us = SA.Addr->members_if(DFG.IsUse, DFG); + if (Us.size() != 1) + continue; + NodeAddr<UseNode*> UA0 = Us[0]; + RegisterRef UR0 = UA0.Addr->getRegRef(); + NodeId RD0 = UA0.Addr->getReachingDef(); + + for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) { + auto UA = DFG.addr<UseNode*>(N); + NextN = UA.Addr->getSibling(); + uint16_t F = UA.Addr->getFlags(); + if ((F & NodeAttrs::PhiRef) || (F & NodeAttrs::Fixed)) + continue; + if (UA.Addr->getRegRef() != DR0) + continue; + NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG); + assert(DFG.IsCode<NodeAttrs::Stmt>(IA)); + 
MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + if (RDefMap[UR0][IA.Id] != RD0) + continue; + MachineOperand &Op = UA.Addr->getOp(); + if (Op.isTied()) + continue; + if (trace()) { + dbgs() << "can replace " << Print<RegisterRef>(DR0, DFG) + << " with " << Print<RegisterRef>(UR0, DFG) << " in " + << *NodeAddr<StmtNode*>(IA).Addr->getCode(); + } + + Op.setReg(UR0.Reg); + Op.setSubReg(UR0.Sub); + Changed = true; +#ifndef NDEBUG + if (HasLimit && CpCount >= CpLimit) + break; + CpCount++; +#endif + + if (MI->isCopy()) { + MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1); + if (Op0.getReg() == Op1.getReg() && Op0.getSubReg() == Op1.getSubReg()) + MI->eraseFromParent(); + Deleted.insert(IA.Id); + } + } + } + + return Changed; +} + diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.h b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h new file mode 100644 index 000000000000..02531b94c9b0 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h @@ -0,0 +1,48 @@ +//===--- RDFCopy.h --------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef RDF_COPY_H +#define RDF_COPY_H + +#include "RDFGraph.h" +#include <map> +#include <vector> + +namespace llvm { + class MachineBasicBlock; + class MachineDominatorTree; + class MachineInstr; +} + +namespace rdf { + struct CopyPropagation { + CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg), + Trace(false) {} + + bool run(); + void trace(bool On) { Trace = On; } + bool trace() const { return Trace; } + + private: + const MachineDominatorTree &MDT; + DataFlowGraph &DFG; + DataFlowGraph::DefStackMap DefM; + bool Trace; + + // map: register -> (map: stmt -> reaching def) + std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap; + std::vector<NodeId> Copies; + + void recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI); + void updateMap(NodeAddr<InstrNode*> IA); + bool scanBlock(MachineBasicBlock *B); + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp new file mode 100644 index 000000000000..95668577bd50 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp @@ -0,0 +1,204 @@ +//===--- RDFDeadCode.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// RDF-based generic dead code elimination. + +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "RDFDeadCode.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +// Check if the given instruction has observable side-effects, i.e. if +// it should be considered "live". It is safe for this function to be +// overly conservative (i.e. return "true" for all instructions), but it +// is not safe to return "false" for an instruction that should not be +// considered removable. 
+bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const { + if (MI->mayStore() || MI->isBranch() || MI->isCall() || MI->isReturn()) + return true; + if (MI->hasOrderedMemoryRef() || MI->hasUnmodeledSideEffects()) + return true; + if (MI->isPHI()) + return false; + for (auto &Op : MI->operands()) + if (Op.isReg() && MRI.isReserved(Op.getReg())) + return true; + return false; +} + +void DeadCodeElimination::scanInstr(NodeAddr<InstrNode*> IA, + SetVector<NodeId> &WorkQ) { + if (!DFG.IsCode<NodeAttrs::Stmt>(IA)) + return; + if (!isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode())) + return; + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) { + if (!LiveNodes.count(RA.Id)) + WorkQ.insert(RA.Id); + } +} + +void DeadCodeElimination::processDef(NodeAddr<DefNode*> DA, + SetVector<NodeId> &WorkQ) { + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG); + for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { + if (!LiveNodes.count(UA.Id)) + WorkQ.insert(UA.Id); + } + for (NodeAddr<DefNode*> TA : DFG.getRelatedRefs(IA, DA)) + LiveNodes.insert(TA.Id); +} + +void DeadCodeElimination::processUse(NodeAddr<UseNode*> UA, + SetVector<NodeId> &WorkQ) { + for (NodeAddr<DefNode*> DA : LV.getAllReachingDefs(UA)) { + if (!LiveNodes.count(DA.Id)) + WorkQ.insert(DA.Id); + } +} + +// Traverse the DFG and collect the set dead RefNodes and the set of +// dead instructions. Return "true" if any of these sets is non-empty, +// "false" otherwise. +bool DeadCodeElimination::collect() { + // This function works by first finding all live nodes. The dead nodes + // are then the complement of the set of live nodes. + // + // Assume that all nodes are dead. Identify instructions which must be + // considered live, i.e. instructions with observable side-effects, such + // as calls and stores. All arguments of such instructions are considered + // live. For each live def, all operands used in the corresponding + // instruction are considered live. For each live use, all its reaching + // defs are considered live. + LiveNodes.clear(); + SetVector<NodeId> WorkQ; + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) + scanInstr(IA, WorkQ); + + while (!WorkQ.empty()) { + NodeId N = *WorkQ.begin(); + WorkQ.remove(N); + LiveNodes.insert(N); + auto RA = DFG.addr<RefNode*>(N); + if (DFG.IsDef(RA)) + processDef(RA, WorkQ); + else + processUse(RA, WorkQ); + } + + if (trace()) { + dbgs() << "Live nodes:\n"; + for (NodeId N : LiveNodes) { + auto RA = DFG.addr<RefNode*>(N); + dbgs() << PrintNode<RefNode*>(RA, DFG) << "\n"; + } + } + + auto IsDead = [this] (NodeAddr<InstrNode*> IA) -> bool { + for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) + if (LiveNodes.count(DA.Id)) + return false; + return true; + }; + + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + if (!LiveNodes.count(RA.Id)) + DeadNodes.insert(RA.Id); + if (DFG.IsCode<NodeAttrs::Stmt>(IA)) + if (isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode())) + continue; + if (IsDead(IA)) { + DeadInstrs.insert(IA.Id); + if (trace()) + dbgs() << "Dead instr: " << PrintNode<InstrNode*>(IA, DFG) << "\n"; + } + } + } + + return !DeadNodes.empty(); +} + +// Erase the nodes given in the Nodes set from DFG. 
In addition to removing +// them from the DFG, if a node corresponds to a statement, the corresponding +// machine instruction is erased from the function. +bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) { + if (Nodes.empty()) + return false; + + // Prepare the actual set of ref nodes to remove: ref nodes from Nodes + // are included directly, for each InstrNode in Nodes, include the set + // of all RefNodes from it. + NodeList DRNs, DINs; + for (auto I : Nodes) { + auto BA = DFG.addr<NodeBase*>(I); + uint16_t Type = BA.Addr->getType(); + if (Type == NodeAttrs::Ref) { + DRNs.push_back(DFG.addr<RefNode*>(I)); + continue; + } + + // If it's a code node, add all ref nodes from it. + uint16_t Kind = BA.Addr->getKind(); + if (Kind == NodeAttrs::Stmt || Kind == NodeAttrs::Phi) { + for (auto N : NodeAddr<CodeNode*>(BA).Addr->members(DFG)) + DRNs.push_back(N); + DINs.push_back(DFG.addr<InstrNode*>(I)); + } else { + llvm_unreachable("Unexpected code node"); + return false; + } + } + + // Sort the list so that use nodes are removed first. This makes the + // "unlink" functions a bit faster. + auto UsesFirst = [] (NodeAddr<RefNode*> A, NodeAddr<RefNode*> B) -> bool { + uint16_t KindA = A.Addr->getKind(), KindB = B.Addr->getKind(); + if (KindA == NodeAttrs::Use && KindB == NodeAttrs::Def) + return true; + if (KindA == NodeAttrs::Def && KindB == NodeAttrs::Use) + return false; + return A.Id < B.Id; + }; + std::sort(DRNs.begin(), DRNs.end(), UsesFirst); + + if (trace()) + dbgs() << "Removing dead ref nodes:\n"; + for (NodeAddr<RefNode*> RA : DRNs) { + if (trace()) + dbgs() << " " << PrintNode<RefNode*>(RA, DFG) << '\n'; + if (DFG.IsUse(RA)) + DFG.unlinkUse(RA); + else if (DFG.IsDef(RA)) + DFG.unlinkDef(RA); + } + + // Now, remove all dead instruction nodes. + for (NodeAddr<InstrNode*> IA : DINs) { + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + BA.Addr->removeMember(IA, DFG); + if (!DFG.IsCode<NodeAttrs::Stmt>(IA)) + continue; + + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + if (trace()) + dbgs() << "erasing: " << *MI; + MI->eraseFromParent(); + } + return true; +} diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h new file mode 100644 index 000000000000..f4373fb5007d --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h @@ -0,0 +1,65 @@ +//===--- RDFDeadCode.h ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// RDF-based generic dead code elimination. +// +// The main interface of this class are functions "collect" and "erase". +// This allows custom processing of the function being optimized by a +// particular consumer. The simplest way to use this class would be to +// instantiate an object, and then simply call "collect" and "erase", +// passing the result of "getDeadInstrs()" to it. +// A more complex scenario would be to call "collect" first, then visit +// all post-increment instructions to see if the address update is dead +// or not, and if it is, convert the instruction to a non-updating form. +// After that "erase" can be called with the set of nodes including both, +// dead defs from the updating instructions and the nodes corresponding +// to the dead instructions. 
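Concretely, the simplest client described in the comment above looks roughly like the following sketch. It assumes a DataFlowGraph G and a MachineRegisterInfo MRI are already set up, as HexagonRDFOpt::runOnMachineFunction does earlier in this patch; it is an illustration of the comment, not code from the patch.

// Sketch of the simple usage pattern from the comment above; G and MRI
// are assumed to exist (see HexagonRDFOpt::runOnMachineFunction).
rdf::DeadCodeElimination DCE(G, MRI);
DCE.trace(false);                    // set to true for the dbgs() trace
if (DCE.collect())                   // mark live nodes, gather dead ones
  DCE.erase(DCE.getDeadInstrs());    // erase only whole dead instructions

HexagonDCE::run earlier in this patch is an instance of the more involved scenario: it calls collect, rewrites dead post-increment address updates into non-updating forms, and only then calls erase with the combined set of nodes.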
+ +#ifndef RDF_DEADCODE_H +#define RDF_DEADCODE_H + +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" + +namespace llvm { + class MachineRegisterInfo; +} + +namespace rdf { + struct DeadCodeElimination { + DeadCodeElimination(DataFlowGraph &dfg, MachineRegisterInfo &mri) + : Trace(false), DFG(dfg), MRI(mri), LV(mri, dfg) {} + + bool collect(); + bool erase(const SetVector<NodeId> &Nodes); + void trace(bool On) { Trace = On; } + bool trace() const { return Trace; } + + SetVector<NodeId> getDeadNodes() { return DeadNodes; } + SetVector<NodeId> getDeadInstrs() { return DeadInstrs; } + DataFlowGraph &getDFG() { return DFG; } + + private: + bool Trace; + SetVector<NodeId> LiveNodes; + SetVector<NodeId> DeadNodes; + SetVector<NodeId> DeadInstrs; + DataFlowGraph &DFG; + MachineRegisterInfo &MRI; + Liveness LV; + + bool isLiveInstr(const MachineInstr *MI) const; + void scanInstr(NodeAddr<InstrNode*> IA, SetVector<NodeId> &WorkQ); + void processDef(NodeAddr<DefNode*> DA, SetVector<NodeId> &WorkQ); + void processUse(NodeAddr<UseNode*> UA, SetVector<NodeId> &WorkQ); + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp new file mode 100644 index 000000000000..9b47422153bb --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp @@ -0,0 +1,1716 @@ +//===--- RDFGraph.cpp -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Target-independent, SSA-based data flow graph for register data flow (RDF). +// +#include "RDFGraph.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +// Printing functions. Have them here first, so that the rest of the code +// can use them. 
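The printing scheme that follows relies on a small wrapper idiom: Print<T> bundles the object with the graph it belongs to, so one operator<< overload per node type can render ids and register references with full context. A self-contained toy version of the idiom (the Ctx struct and all names are illustrative only, not the RDF classes):

#include <iostream>

// Toy context standing in for the DataFlowGraph reference carried by Print<T>.
struct Ctx { const char *Prefix; };

template <typename T> struct Print {
  Print(const T &V, const Ctx &C) : Obj(V), G(C) {}
  const T &Obj;
  const Ctx &G;
};

// One overload per wrapped type selects how the value is rendered in context.
template <typename T>
std::ostream &operator<<(std::ostream &OS, const Print<T> &P) {
  return OS << P.G.Prefix << P.Obj;
}

int main() {
  Ctx G{"node #"};
  std::cout << Print<unsigned>(42, G) << '\n';  // prints "node #42"
  return 0;
}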
+namespace rdf { + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) { + auto &TRI = P.G.getTRI(); + if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs()) + OS << TRI.getName(P.Obj.Reg); + else + OS << '#' << P.Obj.Reg; + if (P.Obj.Sub > 0) { + OS << ':'; + if (P.Obj.Sub < TRI.getNumSubRegIndices()) + OS << TRI.getSubRegIndexName(P.Obj.Sub); + else + OS << '#' << P.Obj.Sub; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) { + auto NA = P.G.addr<NodeBase*>(P.Obj); + uint16_t Attrs = NA.Addr->getAttrs(); + uint16_t Kind = NodeAttrs::kind(Attrs); + uint16_t Flags = NodeAttrs::flags(Attrs); + switch (NodeAttrs::type(Attrs)) { + case NodeAttrs::Code: + switch (Kind) { + case NodeAttrs::Func: OS << 'f'; break; + case NodeAttrs::Block: OS << 'b'; break; + case NodeAttrs::Stmt: OS << 's'; break; + case NodeAttrs::Phi: OS << 'p'; break; + default: OS << "c?"; break; + } + break; + case NodeAttrs::Ref: + if (Flags & NodeAttrs::Preserving) + OS << '+'; + if (Flags & NodeAttrs::Clobbering) + OS << '~'; + switch (Kind) { + case NodeAttrs::Use: OS << 'u'; break; + case NodeAttrs::Def: OS << 'd'; break; + case NodeAttrs::Block: OS << 'b'; break; + default: OS << "r?"; break; + } + break; + default: + OS << '?'; + break; + } + OS << P.Obj; + if (Flags & NodeAttrs::Shadow) + OS << '"'; + return OS; +} + +namespace { + void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA, + const DataFlowGraph &G) { + OS << Print<NodeId>(RA.Id, G) << '<' + << Print<RegisterRef>(RA.Addr->getRegRef(), G) << '>'; + if (RA.Addr->getFlags() & NodeAttrs::Fixed) + OS << '!'; + } +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getReachedDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getReachedUse()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<PhiUseNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getPredecessor()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) { + switch (P.Obj.Addr->getKind()) { + case NodeAttrs::Def: + OS << PrintNode<DefNode*>(P.Obj, P.G); + break; + case NodeAttrs::Use: + if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef) + OS << PrintNode<PhiUseNode*>(P.Obj, P.G); + else + OS << PrintNode<UseNode*>(P.Obj, P.G); + break; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) { + unsigned N = P.Obj.size(); + for (auto I : P.Obj) { + OS << Print<NodeId>(I.Id, P.G); + if (--N) + OS << ' '; + } + return OS; +} + +template<> +raw_ostream 
&operator<< (raw_ostream &OS, const Print<NodeSet> &P) { + unsigned N = P.Obj.size(); + for (auto I : P.Obj) { + OS << Print<NodeId>(I, P.G); + if (--N) + OS << ' '; + } + return OS; +} + +namespace { + template <typename T> + struct PrintListV { + PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {} + typedef T Type; + const NodeList &List; + const DataFlowGraph &G; + }; + + template <typename T> + raw_ostream &operator<< (raw_ostream &OS, const PrintListV<T> &P) { + unsigned N = P.List.size(); + for (NodeAddr<T> A : P.List) { + OS << PrintNode<T>(A, P.G); + if (--N) + OS << ", "; + } + return OS; + } +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) { + OS << Print<NodeId>(P.Obj.Id, P.G) << ": phi [" + << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<StmtNode*>> &P) { + unsigned Opc = P.Obj.Addr->getCode()->getOpcode(); + OS << Print<NodeId>(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc) + << " [" << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<InstrNode*>> &P) { + switch (P.Obj.Addr->getKind()) { + case NodeAttrs::Phi: + OS << PrintNode<PhiNode*>(P.Obj, P.G); + break; + case NodeAttrs::Stmt: + OS << PrintNode<StmtNode*>(P.Obj, P.G); + break; + default: + OS << "instr? " << Print<NodeId>(P.Obj.Id, P.G); + break; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<BlockNode*>> &P) { + auto *BB = P.Obj.Addr->getCode(); + unsigned NP = BB->pred_size(); + std::vector<int> Ns; + auto PrintBBs = [&OS,&P] (std::vector<int> Ns) -> void { + unsigned N = Ns.size(); + for (auto I : Ns) { + OS << "BB#" << I; + if (--N) + OS << ", "; + } + }; + + OS << Print<NodeId>(P.Obj.Id, P.G) << ": === BB#" << BB->getNumber() + << " === preds(" << NP << "): "; + for (auto I : BB->predecessors()) + Ns.push_back(I->getNumber()); + PrintBBs(Ns); + + unsigned NS = BB->succ_size(); + OS << " succs(" << NS << "): "; + Ns.clear(); + for (auto I : BB->successors()) + Ns.push_back(I->getNumber()); + PrintBBs(Ns); + OS << '\n'; + + for (auto I : P.Obj.Addr->members(P.G)) + OS << PrintNode<InstrNode*>(I, P.G) << '\n'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<FuncNode*>> &P) { + OS << "DFG dump:[\n" << Print<NodeId>(P.Obj.Id, P.G) << ": Function: " + << P.Obj.Addr->getCode()->getName() << '\n'; + for (auto I : P.Obj.Addr->members(P.G)) + OS << PrintNode<BlockNode*>(I, P.G) << '\n'; + OS << "]\n"; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) { + OS << '{'; + for (auto I : P.Obj) + OS << ' ' << Print<RegisterRef>(I, P.G); + OS << " }"; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<DataFlowGraph::DefStack> &P) { + for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) { + OS << Print<NodeId>(I->Id, P.G) + << '<' << Print<RegisterRef>(I->Addr->getRegRef(), P.G) << '>'; + I.down(); + if (I != E) + OS << ' '; + } + return OS; +} + +} // namespace rdf + +// Node allocation functions. +// +// Node allocator is like a slab memory allocator: it allocates blocks of +// memory in sizes that are multiples of the size of a node. Each block has +// the same size. 
Nodes are allocated from the currently active block, and +// when it becomes full, a new one is created. +// There is a mapping scheme between node id and its location in a block, +// and within that block is described in the header file. +// +void NodeAllocator::startNewBlock() { + void *T = MemPool.Allocate(NodesPerBlock*NodeMemSize, NodeMemSize); + char *P = static_cast<char*>(T); + Blocks.push_back(P); + // Check if the block index is still within the allowed range, i.e. less + // than 2^N, where N is the number of bits in NodeId for the block index. + // BitsPerIndex is the number of bits per node index. + assert((Blocks.size() < (1U << (8*sizeof(NodeId)-BitsPerIndex))) && + "Out of bits for block index"); + ActiveEnd = P; +} + +bool NodeAllocator::needNewBlock() { + if (Blocks.empty()) + return true; + + char *ActiveBegin = Blocks.back(); + uint32_t Index = (ActiveEnd-ActiveBegin)/NodeMemSize; + return Index >= NodesPerBlock; +} + +NodeAddr<NodeBase*> NodeAllocator::New() { + if (needNewBlock()) + startNewBlock(); + + uint32_t ActiveB = Blocks.size()-1; + uint32_t Index = (ActiveEnd - Blocks[ActiveB])/NodeMemSize; + NodeAddr<NodeBase*> NA = { reinterpret_cast<NodeBase*>(ActiveEnd), + makeId(ActiveB, Index) }; + ActiveEnd += NodeMemSize; + return NA; +} + +NodeId NodeAllocator::id(const NodeBase *P) const { + uintptr_t A = reinterpret_cast<uintptr_t>(P); + for (unsigned i = 0, n = Blocks.size(); i != n; ++i) { + uintptr_t B = reinterpret_cast<uintptr_t>(Blocks[i]); + if (A < B || A >= B + NodesPerBlock*NodeMemSize) + continue; + uint32_t Idx = (A-B)/NodeMemSize; + return makeId(i, Idx); + } + llvm_unreachable("Invalid node address"); +} + +void NodeAllocator::clear() { + MemPool.Reset(); + Blocks.clear(); + ActiveEnd = nullptr; +} + + +// Insert node NA after "this" in the circular chain. +void NodeBase::append(NodeAddr<NodeBase*> NA) { + NodeId Nx = Next; + // If NA is already "next", do nothing. + if (Next != NA.Id) { + Next = NA.Id; + NA.Addr->Next = Nx; + } +} + + +// Fundamental node manipulator functions. + +// Obtain the register reference from a reference node. +RegisterRef RefNode::getRegRef() const { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef) + return Ref.RR; + assert(Ref.Op != nullptr); + return { Ref.Op->getReg(), Ref.Op->getSubReg() }; +} + +// Set the register reference in the reference node directly (for references +// in phi nodes). +void RefNode::setRegRef(RegisterRef RR) { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef); + Ref.RR = RR; +} + +// Set the register reference in the reference node based on a machine +// operand (for references in statement nodes). +void RefNode::setRegRef(MachineOperand *Op) { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)); + Ref.Op = Op; +} + +// Get the owner of a given reference node. +NodeAddr<NodeBase*> RefNode::getOwner(const DataFlowGraph &G) { + NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + if (NA.Addr->getType() == NodeAttrs::Code) + return NA; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } + llvm_unreachable("No owner in circular list"); +} + +// Connect the def node to the reaching def node. +void DefNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { + Ref.RD = DA.Id; + Ref.Sib = DA.Addr->getReachedDef(); + DA.Addr->setReachedDef(Self); +} + +// Connect the use node to the reaching def node. 
+void UseNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { + Ref.RD = DA.Id; + Ref.Sib = DA.Addr->getReachedUse(); + DA.Addr->setReachedUse(Self); +} + +// Get the first member of the code node. +NodeAddr<NodeBase*> CodeNode::getFirstMember(const DataFlowGraph &G) const { + if (Code.FirstM == 0) + return NodeAddr<NodeBase*>(); + return G.addr<NodeBase*>(Code.FirstM); +} + +// Get the last member of the code node. +NodeAddr<NodeBase*> CodeNode::getLastMember(const DataFlowGraph &G) const { + if (Code.LastM == 0) + return NodeAddr<NodeBase*>(); + return G.addr<NodeBase*>(Code.LastM); +} + +// Add node NA at the end of the member list of the given code node. +void CodeNode::addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { + auto ML = getLastMember(G); + if (ML.Id != 0) { + ML.Addr->append(NA); + } else { + Code.FirstM = NA.Id; + NodeId Self = G.id(this); + NA.Addr->setNext(Self); + } + Code.LastM = NA.Id; +} + +// Add node NA after member node MA in the given code node. +void CodeNode::addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, + const DataFlowGraph &G) { + MA.Addr->append(NA); + if (Code.LastM == MA.Id) + Code.LastM = NA.Id; +} + +// Remove member node NA from the given code node. +void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { + auto MA = getFirstMember(G); + assert(MA.Id != 0); + + // Special handling if the member to remove is the first member. + if (MA.Id == NA.Id) { + if (Code.LastM == MA.Id) { + // If it is the only member, set both first and last to 0. + Code.FirstM = Code.LastM = 0; + } else { + // Otherwise, advance the first member. + Code.FirstM = MA.Addr->getNext(); + } + return; + } + + while (MA.Addr != this) { + NodeId MX = MA.Addr->getNext(); + if (MX == NA.Id) { + MA.Addr->setNext(NA.Addr->getNext()); + // If the member to remove happens to be the last one, update the + // LastM indicator. + if (Code.LastM == NA.Id) + Code.LastM = MA.Id; + return; + } + MA = G.addr<NodeBase*>(MX); + } + llvm_unreachable("No such member"); +} + +// Return the list of all members of the code node. +NodeList CodeNode::members(const DataFlowGraph &G) const { + static auto True = [] (NodeAddr<NodeBase*>) -> bool { return true; }; + return members_if(True, G); +} + +// Return the owner of the given instr node. +NodeAddr<NodeBase*> InstrNode::getOwner(const DataFlowGraph &G) { + NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + assert(NA.Addr->getType() == NodeAttrs::Code); + if (NA.Addr->getKind() == NodeAttrs::Block) + return NA; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } + llvm_unreachable("No owner in circular list"); +} + +// Add the phi node PA to the given block node. +void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) { + auto M = getFirstMember(G); + if (M.Id == 0) { + addMember(PA, G); + return; + } + + assert(M.Addr->getType() == NodeAttrs::Code); + if (M.Addr->getKind() == NodeAttrs::Stmt) { + // If the first member of the block is a statement, insert the phi as + // the first member. + Code.FirstM = PA.Id; + PA.Addr->setNext(M.Id); + } else { + // If the first member is a phi, find the last phi, and append PA to it. + assert(M.Addr->getKind() == NodeAttrs::Phi); + NodeAddr<NodeBase*> MN = M; + do { + M = MN; + MN = G.addr<NodeBase*>(M.Addr->getNext()); + assert(MN.Addr->getType() == NodeAttrs::Code); + } while (MN.Addr->getKind() == NodeAttrs::Phi); + + // M is the last phi. 
+ addMemberAfter(M, PA, G); + } +} + +// Find the block node corresponding to the machine basic block BB in the +// given func node. +NodeAddr<BlockNode*> FuncNode::findBlock(const MachineBasicBlock *BB, + const DataFlowGraph &G) const { + auto EqBB = [BB] (NodeAddr<NodeBase*> NA) -> bool { + return NodeAddr<BlockNode*>(NA).Addr->getCode() == BB; + }; + NodeList Ms = members_if(EqBB, G); + if (!Ms.empty()) + return Ms[0]; + return NodeAddr<BlockNode*>(); +} + +// Get the block node for the entry block in the given function. +NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) { + MachineBasicBlock *EntryB = &getCode()->front(); + return findBlock(EntryB, G); +} + + +// Register aliasing information. +// +// In theory, the lane information could be used to determine register +// covering (and aliasing), but depending on the sub-register structure, +// the lane mask information may be missing. The covering information +// must be available for this framework to work, so relying solely on +// the lane data is not sufficient. + +// Determine whether RA covers RB. +bool RegisterAliasInfo::covers(RegisterRef RA, RegisterRef RB) const { + if (RA == RB) + return true; + if (TargetRegisterInfo::isVirtualRegister(RA.Reg)) { + assert(TargetRegisterInfo::isVirtualRegister(RB.Reg)); + if (RA.Reg != RB.Reg) + return false; + if (RA.Sub == 0) + return true; + return TRI.composeSubRegIndices(RA.Sub, RB.Sub) == RA.Sub; + } + + assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg) && + TargetRegisterInfo::isPhysicalRegister(RB.Reg)); + unsigned A = RA.Sub != 0 ? TRI.getSubReg(RA.Reg, RA.Sub) : RA.Reg; + unsigned B = RB.Sub != 0 ? TRI.getSubReg(RB.Reg, RB.Sub) : RB.Reg; + return TRI.isSubRegister(A, B); +} + +// Determine whether RR is covered by the set of references RRs. +bool RegisterAliasInfo::covers(const RegisterSet &RRs, RegisterRef RR) const { + if (RRs.count(RR)) + return true; + + // For virtual registers, we cannot accurately determine covering based + // on subregisters. If RR itself is not present in RRs, but it has a sub- + // register reference, check for the super-register alone. Otherwise, + // assume non-covering. + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) { + if (RR.Sub != 0) + return RRs.count({RR.Reg, 0}); + return false; + } + + // If any super-register of RR is present, then RR is covered. + unsigned Reg = RR.Sub == 0 ? RR.Reg : TRI.getSubReg(RR.Reg, RR.Sub); + for (MCSuperRegIterator SR(Reg, &TRI); SR.isValid(); ++SR) + if (RRs.count({*SR, 0})) + return true; + + return false; +} + +// Get the list of references aliased to RR. +std::vector<RegisterRef> RegisterAliasInfo::getAliasSet(RegisterRef RR) const { + // Do not include RR in the alias set. For virtual registers return an + // empty set. + std::vector<RegisterRef> AS; + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) + return AS; + assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg)); + unsigned R = RR.Reg; + if (RR.Sub) + R = TRI.getSubReg(RR.Reg, RR.Sub); + + for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI) + AS.push_back(RegisterRef({*AI, 0})); + return AS; +} + +// Check whether RA and RB are aliased. 
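For virtual registers, the alias test in the function below reduces to an interval-overlap check on the (offset, size) pairs returned by getSubRegIdxOffset/getSubRegIdxSize. As a standalone illustration of just that check (the numbers are made up):

#include <cassert>

// Two ranges [O, O+S) overlap iff one of them starts inside the other.
// (Offsets/sizes stand in for TRI.getSubRegIdxOffset/Size values.)
static bool rangesOverlap(unsigned OA, unsigned SA, unsigned OB, unsigned SB) {
  return (OA <= OB && OA + SA > OB) || (OB <= OA && OB + SB > OA);
}

int main() {
  assert(rangesOverlap(0, 32, 0, 64));    // low half vs. full 64-bit register
  assert(!rangesOverlap(0, 32, 32, 32));  // low half vs. high half: disjoint
  return 0;
}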
+bool RegisterAliasInfo::alias(RegisterRef RA, RegisterRef RB) const { + bool VirtA = TargetRegisterInfo::isVirtualRegister(RA.Reg); + bool VirtB = TargetRegisterInfo::isVirtualRegister(RB.Reg); + bool PhysA = TargetRegisterInfo::isPhysicalRegister(RA.Reg); + bool PhysB = TargetRegisterInfo::isPhysicalRegister(RB.Reg); + + if (VirtA != VirtB) + return false; + + if (VirtA) { + if (RA.Reg != RB.Reg) + return false; + // RA and RB refer to the same register. If any of them refer to the + // whole register, they must be aliased. + if (RA.Sub == 0 || RB.Sub == 0) + return true; + unsigned SA = TRI.getSubRegIdxSize(RA.Sub); + unsigned OA = TRI.getSubRegIdxOffset(RA.Sub); + unsigned SB = TRI.getSubRegIdxSize(RB.Sub); + unsigned OB = TRI.getSubRegIdxOffset(RB.Sub); + if (OA <= OB && OA+SA > OB) + return true; + if (OB <= OA && OB+SB > OA) + return true; + return false; + } + + assert(PhysA && PhysB); + (void)PhysA, (void)PhysB; + unsigned A = RA.Sub ? TRI.getSubReg(RA.Reg, RA.Sub) : RA.Reg; + unsigned B = RB.Sub ? TRI.getSubReg(RB.Reg, RB.Sub) : RB.Reg; + for (MCRegAliasIterator I(A, &TRI, true); I.isValid(); ++I) + if (B == *I) + return true; + return false; +} + + +// Target operand information. +// + +// For a given instruction, check if there are any bits of RR that can remain +// unchanged across this def. +bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum) + const { + return TII.isPredicated(&In); +} + +// Check if the definition of RR produces an unspecified value. +bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum) + const { + if (In.isCall()) + if (In.getOperand(OpNum).isImplicit()) + return true; + return false; +} + +// Check if the given instruction specifically requires +bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum) + const { + if (In.isCall() || In.isReturn()) + return true; + const MCInstrDesc &D = In.getDesc(); + if (!D.getImplicitDefs() && !D.getImplicitUses()) + return false; + const MachineOperand &Op = In.getOperand(OpNum); + // If there is a sub-register, treat the operand as non-fixed. Currently, + // fixed registers are those that are listed in the descriptor as implicit + // uses or defs, and those lists do not allow sub-registers. + if (Op.getSubReg() != 0) + return false; + unsigned Reg = Op.getReg(); + const MCPhysReg *ImpR = Op.isDef() ? D.getImplicitDefs() + : D.getImplicitUses(); + if (!ImpR) + return false; + while (*ImpR) + if (*ImpR++ == Reg) + return true; + return false; +} + + +// +// The data flow graph construction. +// + +DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, + const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf, const RegisterAliasInfo &rai, + const TargetOperandInfo &toi) + : TimeG("rdf"), MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), RAI(rai), + TOI(toi) { +} + + +// The implementation of the definition stack. +// Each register reference has its own definition stack. In particular, +// for a register references "Reg" and "Reg:subreg" will each have their +// own definition stacks. + +// Construct a stack iterator. +DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S, + bool Top) : DS(S) { + if (!Top) { + // Initialize to bottom. + Pos = 0; + return; + } + // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty). 
+ Pos = DS.Stack.size(); + while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos-1])) + Pos--; +} + +// Return the size of the stack, including block delimiters. +unsigned DataFlowGraph::DefStack::size() const { + unsigned S = 0; + for (auto I = top(), E = bottom(); I != E; I.down()) + S++; + return S; +} + +// Remove the top entry from the stack. Remove all intervening delimiters +// so that after this, the stack is either empty, or the top of the stack +// is a non-delimiter. +void DataFlowGraph::DefStack::pop() { + assert(!empty()); + unsigned P = nextDown(Stack.size()); + Stack.resize(P); +} + +// Push a delimiter for block node N on the stack. +void DataFlowGraph::DefStack::start_block(NodeId N) { + assert(N != 0); + Stack.push_back(NodeAddr<DefNode*>(nullptr, N)); +} + +// Remove all nodes from the top of the stack, until the delimited for +// block node N is encountered. Remove the delimiter as well. In effect, +// this will remove from the stack all definitions from block N. +void DataFlowGraph::DefStack::clear_block(NodeId N) { + assert(N != 0); + unsigned P = Stack.size(); + while (P > 0) { + bool Found = isDelimiter(Stack[P-1], N); + P--; + if (Found) + break; + } + // This will also remove the delimiter, if found. + Stack.resize(P); +} + +// Move the stack iterator up by one. +unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const { + // Get the next valid position after P (skipping all delimiters). + // The input position P does not have to point to a non-delimiter. + unsigned SS = Stack.size(); + bool IsDelim; + assert(P < SS); + do { + P++; + IsDelim = isDelimiter(Stack[P-1]); + } while (P < SS && IsDelim); + assert(!IsDelim); + return P; +} + +// Move the stack iterator down by one. +unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const { + // Get the preceding valid position before P (skipping all delimiters). + // The input position P does not have to point to a non-delimiter. + assert(P > 0 && P <= Stack.size()); + bool IsDelim = isDelimiter(Stack[P-1]); + do { + if (--P == 0) + break; + IsDelim = isDelimiter(Stack[P-1]); + } while (P > 0 && IsDelim); + assert(!IsDelim); + return P; +} + +// Node management functions. + +// Get the pointer to the node with the id N. +NodeBase *DataFlowGraph::ptr(NodeId N) const { + if (N == 0) + return nullptr; + return Memory.ptr(N); +} + +// Get the id of the node at the address P. +NodeId DataFlowGraph::id(const NodeBase *P) const { + if (P == nullptr) + return 0; + return Memory.id(P); +} + +// Allocate a new node and set the attributes to Attrs. +NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) { + NodeAddr<NodeBase*> P = Memory.New(); + P.Addr->init(); + P.Addr->setAttrs(Attrs); + return P; +} + +// Make a copy of the given node B, except for the data-flow links, which +// are set to 0. +NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) { + NodeAddr<NodeBase*> NA = newNode(0); + memcpy(NA.Addr, B.Addr, sizeof(NodeBase)); + // Ref nodes need to have the data-flow links reset. + if (NA.Addr->getType() == NodeAttrs::Ref) { + NodeAddr<RefNode*> RA = NA; + RA.Addr->setReachingDef(0); + RA.Addr->setSibling(0); + if (NA.Addr->getKind() == NodeAttrs::Def) { + NodeAddr<DefNode*> DA = NA; + DA.Addr->setReachedDef(0); + DA.Addr->setReachedUse(0); + } + } + return NA; +} + + +// Allocation routines for specific node types/kinds. 
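Before those routines, one remark on the DefStack implemented above: the delimiter discipline is what markBlock, releaseBlock and pushDefs (further below) rely on when the builder walks the dominator tree. Entering a block pushes a delimiter on every active stack; leaving it pops everything back to, and including, that delimiter. A self-contained toy model of that discipline (ToyDefStack and its names are illustrative, not the real class):

#include <cassert>
#include <string>
#include <vector>

// Toy per-register definition stack with block delimiters, so that leaving
// a block removes exactly the defs pushed while that block was visited.
struct ToyDefStack {
  struct Entry { bool IsDelim; unsigned Block; std::string Def; };
  std::vector<Entry> S;

  void startBlock(unsigned B)        { S.push_back({true, B, ""}); }
  void pushDef(const std::string &D) { S.push_back({false, 0, D}); }
  void clearBlock(unsigned B) {        // pop defs up to and including B's delimiter
    while (!S.empty()) {
      Entry E = S.back();
      S.pop_back();
      if (E.IsDelim && E.Block == B)
        break;
    }
  }
  std::string top() const {            // top-most non-delimiter, or "" if none
    for (auto I = S.rbegin(); I != S.rend(); ++I)
      if (!I->IsDelim)
        return I->Def;
    return "";
  }
};

int main() {
  ToyDefStack RegR0;
  RegR0.startBlock(1); RegR0.pushDef("d1");   // defs from block 1
  RegR0.startBlock(2); RegR0.pushDef("d2");   // defs from a dominated block 2
  assert(RegR0.top() == "d2");
  RegR0.clearBlock(2);                        // leaving block 2
  assert(RegR0.top() == "d1");
  RegR0.clearBlock(1);                        // leaving block 1
  assert(RegR0.top().empty());
  return 0;
}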
+ +NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags) { + NodeAddr<UseNode*> UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); + UA.Addr->setRegRef(&Op); + return UA; +} + +NodeAddr<PhiUseNode*> DataFlowGraph::newPhiUse(NodeAddr<PhiNode*> Owner, + RegisterRef RR, NodeAddr<BlockNode*> PredB, uint16_t Flags) { + NodeAddr<PhiUseNode*> PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); + assert(Flags & NodeAttrs::PhiRef); + PUA.Addr->setRegRef(RR); + PUA.Addr->setPredecessor(PredB.Id); + return PUA; +} + +NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags) { + NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); + DA.Addr->setRegRef(&Op); + return DA; +} + +NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, + RegisterRef RR, uint16_t Flags) { + NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); + assert(Flags & NodeAttrs::PhiRef); + DA.Addr->setRegRef(RR); + return DA; +} + +NodeAddr<PhiNode*> DataFlowGraph::newPhi(NodeAddr<BlockNode*> Owner) { + NodeAddr<PhiNode*> PA = newNode(NodeAttrs::Code | NodeAttrs::Phi); + Owner.Addr->addPhi(PA, *this); + return PA; +} + +NodeAddr<StmtNode*> DataFlowGraph::newStmt(NodeAddr<BlockNode*> Owner, + MachineInstr *MI) { + NodeAddr<StmtNode*> SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt); + SA.Addr->setCode(MI); + Owner.Addr->addMember(SA, *this); + return SA; +} + +NodeAddr<BlockNode*> DataFlowGraph::newBlock(NodeAddr<FuncNode*> Owner, + MachineBasicBlock *BB) { + NodeAddr<BlockNode*> BA = newNode(NodeAttrs::Code | NodeAttrs::Block); + BA.Addr->setCode(BB); + Owner.Addr->addMember(BA, *this); + return BA; +} + +NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) { + NodeAddr<FuncNode*> FA = newNode(NodeAttrs::Code | NodeAttrs::Func); + FA.Addr->setCode(MF); + return FA; +} + +// Build the data flow graph. +void DataFlowGraph::build() { + reset(); + Func = newFunc(&MF); + + if (MF.empty()) + return; + + for (auto &B : MF) { + auto BA = newBlock(Func, &B); + for (auto &I : B) { + if (I.isDebugValue()) + continue; + buildStmt(BA, I); + } + } + + // Collect information about block references. + NodeAddr<BlockNode*> EA = Func.Addr->getEntryBlock(*this); + BlockRefsMap RefM; + buildBlockRefs(EA, RefM); + + // Add function-entry phi nodes. + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) { + NodeAddr<PhiNode*> PA = newPhi(EA); + RegisterRef RR = { I->first, 0 }; + uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; + NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags); + PA.Addr->addMember(DA, *this); + } + + // Build a map "PhiM" which will contain, for each block, the set + // of references that will require phi definitions in that block. + BlockRefsMap PhiM; + auto Blocks = Func.Addr->members(*this); + for (NodeAddr<BlockNode*> BA : Blocks) + recordDefsForDF(PhiM, RefM, BA); + for (NodeAddr<BlockNode*> BA : Blocks) + buildPhis(PhiM, RefM, BA); + + // Link all the refs. This will recursively traverse the dominator tree. + DefStackMap DM; + linkBlockRefs(DM, EA); + + // Finally, remove all unused phi nodes. + removeUnusedPhis(); +} + +// For each stack in the map DefM, push the delimiter for block B on it. +void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) { + // Push block delimiters. 
+ for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I) + I->second.start_block(B); +} + +// Remove all definitions coming from block B from each stack in DefM. +void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) { + // Pop all defs from this block from the definition stack. Defs that were + // added to the map during the traversal of instructions will not have a + // delimiter, but for those, the whole stack will be emptied. + for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I) + I->second.clear_block(B); + + // Finally, remove empty stacks from the map. + for (auto I = DefM.begin(), E = DefM.end(), NextI = I; I != E; I = NextI) { + NextI = std::next(I); + // This preserves the validity of iterators other than I. + if (I->second.empty()) + DefM.erase(I); + } +} + +// Push all definitions from the instruction node IA to an appropriate +// stack in DefM. +void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) { + NodeList Defs = IA.Addr->members_if(IsDef, *this); + NodeSet Visited; +#ifndef NDEBUG + RegisterSet Defined; +#endif + + // The important objectives of this function are: + // - to be able to handle instructions both while the graph is being + // constructed, and after the graph has been constructed, and + // - maintain proper ordering of definitions on the stack for each + // register reference: + // - if there are two or more related defs in IA (i.e. coming from + // the same machine operand), then only push one def on the stack, + // - if there are multiple unrelated defs of non-overlapping + // subregisters of S, then the stack for S will have both (in an + // unspecified order), but the order does not matter from the data- + // -flow perspective. + + for (NodeAddr<DefNode*> DA : Defs) { + if (Visited.count(DA.Id)) + continue; + NodeList Rel = getRelatedRefs(IA, DA); + NodeAddr<DefNode*> PDA = Rel.front(); + // Push the definition on the stack for the register and all aliases. + RegisterRef RR = PDA.Addr->getRegRef(); +#ifndef NDEBUG + // Assert if the register is defined in two or more unrelated defs. + // This could happen if there are two or more def operands defining it. + if (!Defined.insert(RR).second) { + auto *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + dbgs() << "Multiple definitions of register: " + << Print<RegisterRef>(RR, *this) << " in\n " << *MI + << "in BB#" << MI->getParent()->getNumber() << '\n'; + llvm_unreachable(nullptr); + } +#endif + DefM[RR].push(DA); + for (auto A : RAI.getAliasSet(RR)) { + assert(A != RR); + DefM[A].push(DA); + } + // Mark all the related defs as visited. + for (auto T : Rel) + Visited.insert(T.Id); + } +} + +// Return the list of all reference nodes related to RA, including RA itself. +// See "getNextRelated" for the meaning of a "related reference". +NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const { + assert(IA.Id != 0 && RA.Id != 0); + + NodeList Refs; + NodeId Start = RA.Id; + do { + Refs.push_back(RA); + RA = getNextRelated(IA, RA); + } while (RA.Id != 0 && RA.Id != Start); + return Refs; +} + + +// Clear all information in the graph. +void DataFlowGraph::reset() { + Memory.clear(); + Func = NodeAddr<FuncNode*>(); +} + + +// Return the next reference node in the instruction node IA that is related +// to RA. Conceptually, two reference nodes are related if they refer to the +// same instance of a register access, but differ in flags or other minor +// characteristics. Specific examples of related nodes are shadow reference +// nodes. 
+// Return the equivalent of nullptr if there are no more related references. +NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const { + assert(IA.Id != 0 && RA.Id != 0); + + auto Related = [RA](NodeAddr<RefNode*> TA) -> bool { + if (TA.Addr->getKind() != RA.Addr->getKind()) + return false; + if (TA.Addr->getRegRef() != RA.Addr->getRegRef()) + return false; + return true; + }; + auto RelatedStmt = [&Related,RA](NodeAddr<RefNode*> TA) -> bool { + return Related(TA) && + &RA.Addr->getOp() == &TA.Addr->getOp(); + }; + auto RelatedPhi = [&Related,RA](NodeAddr<RefNode*> TA) -> bool { + if (!Related(TA)) + return false; + if (TA.Addr->getKind() != NodeAttrs::Use) + return true; + // For phi uses, compare predecessor blocks. + const NodeAddr<const PhiUseNode*> TUA = TA; + const NodeAddr<const PhiUseNode*> RUA = RA; + return TUA.Addr->getPredecessor() == RUA.Addr->getPredecessor(); + }; + + RegisterRef RR = RA.Addr->getRegRef(); + if (IA.Addr->getKind() == NodeAttrs::Stmt) + return RA.Addr->getNextRef(RR, RelatedStmt, true, *this); + return RA.Addr->getNextRef(RR, RelatedPhi, true, *this); +} + +// Find the next node related to RA in IA that satisfies condition P. +// If such a node was found, return a pair where the second element is the +// located node. If such a node does not exist, return a pair where the +// first element is the element after which such a node should be inserted, +// and the second element is a null-address. +template <typename Predicate> +std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>> +DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + Predicate P) const { + assert(IA.Id != 0 && RA.Id != 0); + + NodeAddr<RefNode*> NA; + NodeId Start = RA.Id; + while (true) { + NA = getNextRelated(IA, RA); + if (NA.Id == 0 || NA.Id == Start) + break; + if (P(NA)) + break; + RA = NA; + } + + if (NA.Id != 0 && NA.Id != Start) + return std::make_pair(RA, NA); + return std::make_pair(RA, NodeAddr<RefNode*>()); +} + +// Get the next shadow node in IA corresponding to RA, and optionally create +// such a node if it does not exist. +NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create) { + assert(IA.Id != 0 && RA.Id != 0); + + uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow; + auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool { + return TA.Addr->getFlags() == Flags; + }; + auto Loc = locateNextRef(IA, RA, IsShadow); + if (Loc.second.Id != 0 || !Create) + return Loc.second; + + // Create a copy of RA and mark is as shadow. + NodeAddr<RefNode*> NA = cloneNode(RA); + NA.Addr->setFlags(Flags | NodeAttrs::Shadow); + IA.Addr->addMemberAfter(Loc.first, NA, *this); + return NA; +} + +// Get the next shadow node in IA corresponding to RA. Return null-address +// if such a node does not exist. +NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const { + assert(IA.Id != 0 && RA.Id != 0); + uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow; + auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool { + return TA.Addr->getFlags() == Flags; + }; + return locateNextRef(IA, RA, IsShadow).second; +} + +// Create a new statement node in the block node BA that corresponds to +// the machine instruction MI. +void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { + auto SA = newStmt(BA, &In); + + // Collect a set of registers that this instruction implicitly uses + // or defines. 
Implicit operands from an instruction will be ignored + // unless they are listed here. + RegisterSet ImpUses, ImpDefs; + if (const uint16_t *ImpD = In.getDesc().getImplicitDefs()) + while (uint16_t R = *ImpD++) + ImpDefs.insert({R, 0}); + if (const uint16_t *ImpU = In.getDesc().getImplicitUses()) + while (uint16_t R = *ImpU++) + ImpUses.insert({R, 0}); + + bool IsCall = In.isCall(), IsReturn = In.isReturn(); + bool IsPredicated = TII.isPredicated(&In); + unsigned NumOps = In.getNumOperands(); + + // Avoid duplicate implicit defs. This will not detect cases of implicit + // defs that define registers that overlap, but it is not clear how to + // interpret that in the absence of explicit defs. Overlapping explicit + // defs are likely illegal already. + RegisterSet DoneDefs; + // Process explicit defs first. + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + uint16_t Flags = NodeAttrs::None; + if (TOI.isPreserving(In, OpN)) + Flags |= NodeAttrs::Preserving; + if (TOI.isClobbering(In, OpN)) + Flags |= NodeAttrs::Clobbering; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + SA.Addr->addMember(DA, *this); + DoneDefs.insert(RR); + } + + // Process implicit defs, skipping those that have already been added + // as explicit. + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isDef() || !Op.isImplicit()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + if (!IsCall && !ImpDefs.count(RR)) + continue; + if (DoneDefs.count(RR)) + continue; + uint16_t Flags = NodeAttrs::None; + if (TOI.isPreserving(In, OpN)) + Flags |= NodeAttrs::Preserving; + if (TOI.isClobbering(In, OpN)) + Flags |= NodeAttrs::Clobbering; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + SA.Addr->addMember(DA, *this); + DoneDefs.insert(RR); + } + + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isUse()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + // Add implicit uses on return and call instructions, and on predicated + // instructions regardless of whether or not they appear in the instruction + // descriptor's list. + bool Implicit = Op.isImplicit(); + bool TakeImplicit = IsReturn || IsCall || IsPredicated; + if (Implicit && !TakeImplicit && !ImpUses.count(RR)) + continue; + uint16_t Flags = NodeAttrs::None; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<UseNode*> UA = newUse(SA, Op, Flags); + SA.Addr->addMember(UA, *this); + } +} + +// Build a map that for each block will have the set of all references from +// that block, and from all blocks dominated by it. 
+void DataFlowGraph::buildBlockRefs(NodeAddr<BlockNode*> BA, + BlockRefsMap &RefM) { + auto &Refs = RefM[BA.Id]; + MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode()); + assert(N); + for (auto I : *N) { + MachineBasicBlock *SB = I->getBlock(); + auto SBA = Func.Addr->findBlock(SB, *this); + buildBlockRefs(SBA, RefM); + const auto &SRs = RefM[SBA.Id]; + Refs.insert(SRs.begin(), SRs.end()); + } + + for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) + for (NodeAddr<RefNode*> RA : IA.Addr->members(*this)) + Refs.insert(RA.Addr->getRegRef()); +} + +// Scan all defs in the block node BA and record in PhiM the locations of +// phi nodes corresponding to these defs. +void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA) { + // Check all defs from block BA and record them in each block in BA's + // iterated dominance frontier. This information will later be used to + // create phi nodes. + MachineBasicBlock *BB = BA.Addr->getCode(); + assert(BB); + auto DFLoc = MDF.find(BB); + if (DFLoc == MDF.end() || DFLoc->second.empty()) + return; + + // Traverse all instructions in the block and collect the set of all + // defined references. For each reference there will be a phi created + // in the block's iterated dominance frontier. + // This is done to make sure that each defined reference gets only one + // phi node, even if it is defined multiple times. + RegisterSet Defs; + for (auto I : BA.Addr->members(*this)) { + assert(I.Addr->getType() == NodeAttrs::Code); + assert(I.Addr->getKind() == NodeAttrs::Phi || + I.Addr->getKind() == NodeAttrs::Stmt); + NodeAddr<InstrNode*> IA = I; + for (NodeAddr<RefNode*> RA : IA.Addr->members_if(IsDef, *this)) + Defs.insert(RA.Addr->getRegRef()); + } + + // Finally, add the set of defs to each block in the iterated dominance + // frontier. + const MachineDominanceFrontier::DomSetType &DF = DFLoc->second; + SetVector<MachineBasicBlock*> IDF(DF.begin(), DF.end()); + for (unsigned i = 0; i < IDF.size(); ++i) { + auto F = MDF.find(IDF[i]); + if (F != MDF.end()) + IDF.insert(F->second.begin(), F->second.end()); + } + + // Get the register references that are reachable from this block. + RegisterSet &Refs = RefM[BA.Id]; + for (auto DB : IDF) { + auto DBA = Func.Addr->findBlock(DB, *this); + const auto &Rs = RefM[DBA.Id]; + Refs.insert(Rs.begin(), Rs.end()); + } + + for (auto DB : IDF) { + auto DBA = Func.Addr->findBlock(DB, *this); + PhiM[DBA.Id].insert(Defs.begin(), Defs.end()); + } +} + +// Given the locations of phi nodes in the map PhiM, create the phi nodes +// that are located in the block node BA. +void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA) { + // Check if this blocks has any DF defs, i.e. if there are any defs + // that this block is in the iterated dominance frontier of. + auto HasDF = PhiM.find(BA.Id); + if (HasDF == PhiM.end() || HasDF->second.empty()) + return; + + // First, remove all R in Refs in such that there exists T in Refs + // such that T covers R. In other words, only leave those refs that + // are not covered by another ref (i.e. maximal with respect to covering). 
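+  // (For example, a super-register covers each of its sub-registers, so if
+  // both appear in Refs, only the super-register is kept.)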
+ + auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef { + for (auto I : RRs) + if (I != RR && RAI.covers(I, RR)) + RR = I; + return RR; + }; + + RegisterSet MaxDF; + for (auto I : HasDF->second) + MaxDF.insert(MaxCoverIn(I, HasDF->second)); + + std::vector<RegisterRef> MaxRefs; + auto &RefB = RefM[BA.Id]; + for (auto I : MaxDF) + MaxRefs.push_back(MaxCoverIn(I, RefB)); + + // Now, for each R in MaxRefs, get the alias closure of R. If the closure + // only has R in it, create a phi a def for R. Otherwise, create a phi, + // and add a def for each S in the closure. + + // Sort the refs so that the phis will be created in a deterministic order. + std::sort(MaxRefs.begin(), MaxRefs.end()); + // Remove duplicates. + auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end()); + MaxRefs.erase(NewEnd, MaxRefs.end()); + + auto Aliased = [this,&MaxRefs](RegisterRef RR, + std::vector<unsigned> &Closure) -> bool { + for (auto I : Closure) + if (RAI.alias(RR, MaxRefs[I])) + return true; + return false; + }; + + // Prepare a list of NodeIds of the block's predecessors. + std::vector<NodeId> PredList; + const MachineBasicBlock *MBB = BA.Addr->getCode(); + for (auto PB : MBB->predecessors()) { + auto B = Func.Addr->findBlock(PB, *this); + PredList.push_back(B.Id); + } + + while (!MaxRefs.empty()) { + // Put the first element in the closure, and then add all subsequent + // elements from MaxRefs to it, if they alias at least one element + // already in the closure. + // ClosureIdx: vector of indices in MaxRefs of members of the closure. + std::vector<unsigned> ClosureIdx = { 0 }; + for (unsigned i = 1; i != MaxRefs.size(); ++i) + if (Aliased(MaxRefs[i], ClosureIdx)) + ClosureIdx.push_back(i); + + // Build a phi for the closure. + unsigned CS = ClosureIdx.size(); + NodeAddr<PhiNode*> PA = newPhi(BA); + + // Add defs. + for (unsigned X = 0; X != CS; ++X) { + RegisterRef RR = MaxRefs[ClosureIdx[X]]; + uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; + NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags); + PA.Addr->addMember(DA, *this); + } + // Add phi uses. + for (auto P : PredList) { + auto PBA = addr<BlockNode*>(P); + for (unsigned X = 0; X != CS; ++X) { + RegisterRef RR = MaxRefs[ClosureIdx[X]]; + NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA); + PA.Addr->addMember(PUA, *this); + } + } + + // Erase from MaxRefs all elements in the closure. + auto Begin = MaxRefs.begin(); + for (unsigned i = ClosureIdx.size(); i != 0; --i) + MaxRefs.erase(Begin + ClosureIdx[i-1]); + } +} + +// Remove any unneeded phi nodes that were created during the build process. +void DataFlowGraph::removeUnusedPhis() { + // This will remove unused phis, i.e. phis where each def does not reach + // any uses or other defs. This will not detect or remove circular phi + // chains that are otherwise dead. Unused/dead phis are created during + // the build process and this function is intended to remove these cases + // that are easily determinable to be unnecessary. + + SetVector<NodeId> PhiQ; + for (NodeAddr<BlockNode*> BA : Func.Addr->members(*this)) { + for (auto P : BA.Addr->members_if(IsPhi, *this)) + PhiQ.insert(P.Id); + } + + static auto HasUsedDef = [](NodeList &Ms) -> bool { + for (auto M : Ms) { + if (M.Addr->getKind() != NodeAttrs::Def) + continue; + NodeAddr<DefNode*> DA = M; + if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0) + return true; + } + return false; + }; + + // Any phi, if it is removed, may affect other phis (make them dead). 
+ // For each removed phi, collect the potentially affected phis and add + // them back to the queue. + while (!PhiQ.empty()) { + auto PA = addr<PhiNode*>(PhiQ[0]); + PhiQ.remove(PA.Id); + NodeList Refs = PA.Addr->members(*this); + if (HasUsedDef(Refs)) + continue; + for (NodeAddr<RefNode*> RA : Refs) { + if (NodeId RD = RA.Addr->getReachingDef()) { + auto RDA = addr<DefNode*>(RD); + NodeAddr<InstrNode*> OA = RDA.Addr->getOwner(*this); + if (IsPhi(OA)) + PhiQ.insert(OA.Id); + } + if (RA.Addr->isDef()) + unlinkDef(RA); + else + unlinkUse(RA); + } + NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this); + BA.Addr->removeMember(PA, *this); + } +} + +// For a given reference node TA in an instruction node IA, connect the +// reaching def of TA to the appropriate def node. Create any shadow nodes +// as appropriate. +template <typename T> +void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA, + DefStack &DS) { + if (DS.empty()) + return; + RegisterRef RR = TA.Addr->getRegRef(); + NodeAddr<T> TAP; + + // References from the def stack that have been examined so far. + RegisterSet Defs; + + for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) { + RegisterRef QR = I->Addr->getRegRef(); + auto AliasQR = [QR,this] (RegisterRef RR) -> bool { + return RAI.alias(QR, RR); + }; + bool PrecUp = RAI.covers(QR, RR); + // Skip all defs that are aliased to any of the defs that we have already + // seen. If we encounter a covering def, stop the stack traversal early. + if (std::any_of(Defs.begin(), Defs.end(), AliasQR)) { + if (PrecUp) + break; + continue; + } + // The reaching def. + NodeAddr<DefNode*> RDA = *I; + + // Pick the reached node. + if (TAP.Id == 0) { + TAP = TA; + } else { + // Mark the existing ref as "shadow" and create a new shadow. + TAP.Addr->setFlags(TAP.Addr->getFlags() | NodeAttrs::Shadow); + TAP = getNextShadow(IA, TAP, true); + } + + // Create the link. + TAP.Addr->linkToDef(TAP.Id, RDA); + + if (PrecUp) + break; + Defs.insert(QR); + } +} + +// Create data-flow links for all reference nodes in the statement node SA. +void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA) { + RegisterSet Defs; + + // Link all nodes (upwards in the data-flow) with their reaching defs. + for (NodeAddr<RefNode*> RA : SA.Addr->members(*this)) { + uint16_t Kind = RA.Addr->getKind(); + assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use); + RegisterRef RR = RA.Addr->getRegRef(); + // Do not process multiple defs of the same reference. + if (Kind == NodeAttrs::Def && Defs.count(RR)) + continue; + Defs.insert(RR); + + auto F = DefM.find(RR); + if (F == DefM.end()) + continue; + DefStack &DS = F->second; + if (Kind == NodeAttrs::Use) + linkRefUp<UseNode*>(SA, RA, DS); + else if (Kind == NodeAttrs::Def) + linkRefUp<DefNode*>(SA, RA, DS); + else + llvm_unreachable("Unexpected node in instruction"); + } +} + +// Create data-flow links for all instructions in the block node BA. This +// will include updating any phi nodes in BA. +void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) { + // Push block delimiters. + markBlock(BA.Id, DefM); + + // For each non-phi instruction in the block, link all the defs and uses + // to their reaching defs. For any member of the block (including phis), + // push the defs on the corresponding stacks. + for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) { + // Ignore phi nodes here. They will be linked part by part from the + // predecessors. 
+ if (IA.Addr->getKind() == NodeAttrs::Stmt) + linkStmtRefs(DefM, IA); + + // Push the definitions on the stack. + pushDefs(IA, DefM); + } + + // Recursively process all children in the dominator tree. + MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode()); + for (auto I : *N) { + MachineBasicBlock *SB = I->getBlock(); + auto SBA = Func.Addr->findBlock(SB, *this); + linkBlockRefs(DefM, SBA); + } + + // Link the phi uses from the successor blocks. + auto IsUseForBA = [BA](NodeAddr<NodeBase*> NA) -> bool { + if (NA.Addr->getKind() != NodeAttrs::Use) + return false; + assert(NA.Addr->getFlags() & NodeAttrs::PhiRef); + NodeAddr<PhiUseNode*> PUA = NA; + return PUA.Addr->getPredecessor() == BA.Id; + }; + MachineBasicBlock *MBB = BA.Addr->getCode(); + for (auto SB : MBB->successors()) { + auto SBA = Func.Addr->findBlock(SB, *this); + for (NodeAddr<InstrNode*> IA : SBA.Addr->members_if(IsPhi, *this)) { + // Go over each phi use associated with MBB, and link it. + for (auto U : IA.Addr->members_if(IsUseForBA, *this)) { + NodeAddr<PhiUseNode*> PUA = U; + RegisterRef RR = PUA.Addr->getRegRef(); + linkRefUp<UseNode*>(IA, PUA, DefM[RR]); + } + } + } + + // Pop all defs from this block from the definition stacks. + releaseBlock(BA.Id, DefM); +} + +// Remove the use node UA from any data-flow and structural links. +void DataFlowGraph::unlinkUse(NodeAddr<UseNode*> UA) { + NodeId RD = UA.Addr->getReachingDef(); + NodeId Sib = UA.Addr->getSibling(); + + NodeAddr<InstrNode*> IA = UA.Addr->getOwner(*this); + IA.Addr->removeMember(UA, *this); + + if (RD == 0) { + assert(Sib == 0); + return; + } + + auto RDA = addr<DefNode*>(RD); + auto TA = addr<UseNode*>(RDA.Addr->getReachedUse()); + if (TA.Id == UA.Id) { + RDA.Addr->setReachedUse(Sib); + return; + } + + while (TA.Id != 0) { + NodeId S = TA.Addr->getSibling(); + if (S == UA.Id) { + TA.Addr->setSibling(UA.Addr->getSibling()); + return; + } + TA = addr<UseNode*>(S); + } +} + +// Remove the def node DA from any data-flow and structural links. +void DataFlowGraph::unlinkDef(NodeAddr<DefNode*> DA) { + // + // RD + // | reached + // | def + // : + // . + // +----+ + // ... -- | DA | -- ... -- 0 : sibling chain of DA + // +----+ + // | | reached + // | : def + // | . + // | ... : Siblings (defs) + // | + // : reached + // . use + // ... : sibling chain of reached uses + + NodeId RD = DA.Addr->getReachingDef(); + + // Visit all siblings of the reached def and reset their reaching defs. + // Also, defs reached by DA are now "promoted" to being reached by RD, + // so all of them will need to be spliced into the sibling chain where + // DA belongs. + auto getAllNodes = [this] (NodeId N) -> NodeList { + NodeList Res; + while (N) { + auto RA = addr<RefNode*>(N); + // Keep the nodes in the exact sibling order. + Res.push_back(RA); + N = RA.Addr->getSibling(); + } + return Res; + }; + NodeList ReachedDefs = getAllNodes(DA.Addr->getReachedDef()); + NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse()); + + if (RD == 0) { + for (NodeAddr<RefNode*> I : ReachedDefs) + I.Addr->setSibling(0); + for (NodeAddr<RefNode*> I : ReachedUses) + I.Addr->setSibling(0); + } + for (NodeAddr<DefNode*> I : ReachedDefs) + I.Addr->setReachingDef(RD); + for (NodeAddr<UseNode*> I : ReachedUses) + I.Addr->setReachingDef(RD); + + NodeId Sib = DA.Addr->getSibling(); + if (RD == 0) { + assert(Sib == 0); + return; + } + + // Update the reaching def node and remove DA from the sibling list. 
+ auto RDA = addr<DefNode*>(RD); + auto TA = addr<DefNode*>(RDA.Addr->getReachedDef()); + if (TA.Id == DA.Id) { + // If DA is the first reached def, just update the RD's reached def + // to the DA's sibling. + RDA.Addr->setReachedDef(Sib); + } else { + // Otherwise, traverse the sibling list of the reached defs and remove + // DA from it. + while (TA.Id != 0) { + NodeId S = TA.Addr->getSibling(); + if (S == DA.Id) { + TA.Addr->setSibling(Sib); + break; + } + TA = addr<DefNode*>(S); + } + } + + // Splice the DA's reached defs into the RDA's reached def chain. + if (!ReachedDefs.empty()) { + auto Last = NodeAddr<DefNode*>(ReachedDefs.back()); + Last.Addr->setSibling(RDA.Addr->getReachedDef()); + RDA.Addr->setReachedDef(ReachedDefs.front().Id); + } + // Splice the DA's reached uses into the RDA's reached use chain. + if (!ReachedUses.empty()) { + auto Last = NodeAddr<UseNode*>(ReachedUses.back()); + Last.Addr->setSibling(RDA.Addr->getReachedUse()); + RDA.Addr->setReachedUse(ReachedUses.front().Id); + } + + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(*this); + IA.Addr->removeMember(DA, *this); +} diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.h b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h new file mode 100644 index 000000000000..7da7bb5973cf --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h @@ -0,0 +1,841 @@ +//===--- RDFGraph.h -------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Target-independent, SSA-based data flow graph for register data flow (RDF) +// for a non-SSA program representation (e.g. post-RA machine code). +// +// +// *** Introduction +// +// The RDF graph is a collection of nodes, each of which denotes some element +// of the program. There are two main types of such elements: code and refe- +// rences. Conceptually, "code" is something that represents the structure +// of the program, e.g. basic block or a statement, while "reference" is an +// instance of accessing a register, e.g. a definition or a use. Nodes are +// connected with each other based on the structure of the program (such as +// blocks, instructions, etc.), and based on the data flow (e.g. reaching +// definitions, reached uses, etc.). The single-reaching-definition principle +// of SSA is generally observed, although, due to the non-SSA representation +// of the program, there are some differences between the graph and a "pure" +// SSA representation. +// +// +// *** Implementation remarks +// +// Since the graph can contain a large number of nodes, memory consumption +// was one of the major design considerations. As a result, there is a single +// base class NodeBase which defines all members used by all possible derived +// classes. The members are arranged in a union, and a derived class cannot +// add any data members of its own. Each derived class only defines the +// functional interface, i.e. member functions. NodeBase must be a POD, +// which implies that all of its members must also be PODs. +// Since nodes need to be connected with other nodes, pointers have been +// replaced with 32-bit identifiers: each node has an id of type NodeId. +// There are mapping functions in the graph that translate between actual +// memory addresses and the corresponding identifiers. +// A node id of 0 is equivalent to nullptr. 
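+// As an illustration (a sketch in terms of the DataFlowGraph interface
+// declared below), translating between the two representations for a graph
+// G and a node address NA obtained from it looks roughly like:
+//   NodeId Id = G.id(NA.Addr);                        // address -> id
+//   NodeAddr<NodeBase*> Same = G.addr<NodeBase*>(Id); // id -> address
+// and G.ptr(0) returns nullptr, i.e. id 0 plays the role of a null pointer.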
+// +// +// *** Structure of the graph +// +// A code node is always a collection of other nodes. For example, a code +// node corresponding to a basic block will contain code nodes corresponding +// to instructions. In turn, a code node corresponding to an instruction will +// contain a list of reference nodes that correspond to the definitions and +// uses of registers in that instruction. The members are arranged into a +// circular list, which is yet another consequence of the effort to save +// memory: for each member node it should be possible to obtain its owner, +// and it should be possible to access all other members. There are other +// ways to accomplish that, but the circular list seemed the most natural. +// +// +- CodeNode -+ +// | | <---------------------------------------------------+ +// +-+--------+-+ | +// |FirstM |LastM | +// | +-------------------------------------+ | +// | | | +// V V | +// +----------+ Next +----------+ Next Next +----------+ Next | +// | |----->| |-----> ... ----->| |----->-+ +// +- Member -+ +- Member -+ +- Member -+ +// +// The order of members is such that related reference nodes (see below) +// should be contiguous on the member list. +// +// A reference node is a node that encapsulates an access to a register, +// in other words, data flowing into or out of a register. There are two +// major kinds of reference nodes: defs and uses. A def node will contain +// the id of the first reached use, and the id of the first reached def. +// Each def and use will contain the id of the reaching def, and also the +// id of the next reached def (for def nodes) or use (for use nodes). +// The "next node sharing the same reaching def" is denoted as "sibling". +// In summary: +// - Def node contains: reaching def, sibling, first reached def, and first +// reached use. +// - Use node contains: reaching def and sibling. +// +// +-- DefNode --+ +// | R2 = ... | <---+--------------------+ +// ++---------+--+ | | +// |Reached |Reached | | +// |Def |Use | | +// | | |Reaching |Reaching +// | V |Def |Def +// | +-- UseNode --+ Sib +-- UseNode --+ Sib Sib +// | | ... = R2 |----->| ... = R2 |----> ... ----> 0 +// | +-------------+ +-------------+ +// V +// +-- DefNode --+ Sib +// | R2 = ... |----> ... +// ++---------+--+ +// | | +// | | +// ... ... +// +// To get a full picture, the circular lists connecting blocks within a +// function, instructions within a block, etc. should be superimposed with +// the def-def, def-use links shown above. +// To illustrate this, consider a small example in a pseudo-assembly: +// foo: +// add r2, r0, r1 ; r2 = r0+r1 +// addi r0, r2, 1 ; r0 = r2+1 +// ret r0 ; return value in r0 +// +// The graph (in a format used by the debugging functions) would look like: +// +// DFG dump:[ +// f1: Function foo +// b2: === BB#0 === preds(0), succs(0): +// p3: phi [d4<r0>(,d12,u9):] +// p5: phi [d6<r1>(,,u10):] +// s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):] +// s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):] +// s14: ret [u15<r0>(d12):] +// ] +// +// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the +// kind of the node (i.e. f - function, b - basic block, p - phi, s - state- +// ment, d - def, u - use). +// The format of a def node is: +// dN<R>(rd,d,u):sib, +// where +// N - numeric node id, +// R - register being defined +// rd - reaching def, +// d - reached def, +// u - reached use, +// sib - sibling. 
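+// For example, in the dump above, d12<r0>(d4,,u15): is a def of r0 whose
+// reaching def is d4, which reaches no defs, and whose first reached use
+// is u15.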
+// The format of a use node is:
+//   uN<R>[!](rd):sib,
+// where
+//   N   - numeric node id,
+//   R   - register being used,
+//   rd  - reaching def,
+//   sib - sibling.
+// Possible annotations (usually preceding the node id):
+//   +   - preserving def,
+//   ~   - clobbering def,
+//   "   - shadow ref (follows the node id),
+//   !   - fixed register (appears after register name).
+//
+// The circular lists are not explicit in the dump.
+//
+//
+// *** Node attributes
+//
+// NodeBase has a member "Attrs", which is the primary way of determining
+// the node's characteristics. The fields in this member decide whether
+// the node is a code node or a reference node (i.e. node's "type"), then
+// within each type, the "kind" determines what specifically this node
+// represents. The remaining bits, "flags", contain additional information
+// that is even more detailed than the "kind".
+// CodeNode's kinds are:
+// - Phi:   Phi node, members are reference nodes.
+// - Stmt:  Statement, members are reference nodes.
+// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt).
+// - Func:  The whole function. The members are basic block nodes.
+// RefNode's kinds are:
+// - Use.
+// - Def.
+//
+// Meaning of flags:
+// - Preserving: applies only to defs. A preserving def is one that can
+//   preserve some of the original bits among those that are included in
+//   the register associated with that def. For example, if R0 is a 32-bit
+//   register, but a def can only change the lower 16 bits, then it will
+//   be marked as preserving.
+// - Shadow: a reference that has duplicates holding additional reaching
+//   defs (see more below).
+// - Clobbering: applies only to defs, and indicates that the value generated
+//   by this def is unspecified. A typical example would be volatile registers
+//   after function calls.
+//
+//
+// *** Shadow references
+//
+// It may happen that a super-register can have two (or more) non-overlapping
+// sub-registers. When both of these sub-registers are defined and followed
+// by a use of the super-register, the use of the super-register will not
+// have a unique reaching def: both defs of the sub-registers need to be
+// accounted for. In such cases, a duplicate use of the super-register is
+// added and it points to the extra reaching def. Both uses are marked with
+// a flag "shadow". Example:
+// Assume t0 is a super-register of r0 and r1, r0 and r1 do not overlap:
+//   set r0, 1        ; r0 = 1
+//   set r1, 1        ; r1 = 1
+//   addi t1, t0, 1   ; t1 = t0+1
+//
+// The DFG:
+//   s1: set [d2<r0>(,,u9):]
+//   s3: set [d4<r1>(,,u10):]
+//   s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):]
+//
+// The statement s5 has two use nodes for t0: u7" and u8". The quotation
+// mark " indicates that the node is a shadow.
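+//
+// As a rough usage sketch (in terms of the interface declared further below),
+// the uses reached by a def DA in a graph G can be visited by following the
+// reached-use and sibling links:
+//   NodeId U = DA.Addr->getReachedUse();       // first reached use
+//   while (U != 0) {                           // id 0 terminates the chain
+//     NodeAddr<UseNode*> UA = G.addr<UseNode*>(U);
+//     // ... inspect UA, e.g. UA.Addr->getRegRef() ...
+//     U = UA.Addr->getSibling();               // next use with this reaching def
+//   }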
+// +#ifndef RDF_GRAPH_H +#define RDF_GRAPH_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Timer.h" + +#include <functional> +#include <map> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + class MachineBasicBlock; + class MachineFunction; + class MachineInstr; + class MachineOperand; + class MachineDominanceFrontier; + class MachineDominatorTree; + class TargetInstrInfo; + class TargetRegisterInfo; +} + +namespace rdf { + typedef uint32_t NodeId; + + struct NodeAttrs { + enum : uint16_t { + None = 0x0000, // Nothing + + // Types: 2 bits + TypeMask = 0x0003, + Code = 0x0001, // 01, Container + Ref = 0x0002, // 10, Reference + + // Kind: 3 bits + KindMask = 0x0007 << 2, + Def = 0x0001 << 2, // 001 + Use = 0x0002 << 2, // 010 + Phi = 0x0003 << 2, // 011 + Stmt = 0x0004 << 2, // 100 + Block = 0x0005 << 2, // 101 + Func = 0x0006 << 2, // 110 + + // Flags: 5 bits for now + FlagMask = 0x001F << 5, + Shadow = 0x0001 << 5, // 00001, Has extra reaching defs. + Clobbering = 0x0002 << 5, // 00010, Produces unspecified values. + PhiRef = 0x0004 << 5, // 00100, Member of PhiNode. + Preserving = 0x0008 << 5, // 01000, Def can keep original bits. + Fixed = 0x0010 << 5, // 10000, Fixed register. + }; + + static uint16_t type(uint16_t T) { return T & TypeMask; } + static uint16_t kind(uint16_t T) { return T & KindMask; } + static uint16_t flags(uint16_t T) { return T & FlagMask; } + + static uint16_t set_type(uint16_t A, uint16_t T) { + return (A & ~TypeMask) | T; + } + static uint16_t set_kind(uint16_t A, uint16_t K) { + return (A & ~KindMask) | K; + } + static uint16_t set_flags(uint16_t A, uint16_t F) { + return (A & ~FlagMask) | F; + } + + // Test if A contains B. + static bool contains(uint16_t A, uint16_t B) { + if (type(A) != Code) + return false; + uint16_t KB = kind(B); + switch (kind(A)) { + case Func: + return KB == Block; + case Block: + return KB == Phi || KB == Stmt; + case Phi: + case Stmt: + return type(B) == Ref; + } + return false; + } + }; + + template <typename T> struct NodeAddr { + NodeAddr() : Addr(nullptr), Id(0) {} + NodeAddr(T A, NodeId I) : Addr(A), Id(I) {} + NodeAddr(const NodeAddr&) = default; + NodeAddr &operator= (const NodeAddr&) = default; + + bool operator== (const NodeAddr<T> &NA) const { + assert((Addr == NA.Addr) == (Id == NA.Id)); + return Addr == NA.Addr; + } + bool operator!= (const NodeAddr<T> &NA) const { + return !operator==(NA); + } + // Type cast (casting constructor). The reason for having this class + // instead of std::pair. + template <typename S> NodeAddr(const NodeAddr<S> &NA) + : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {} + + T Addr; + NodeId Id; + }; + + struct NodeBase; + + // Fast memory allocation and translation between node id and node address. + // This is really the same idea as the one underlying the "bump pointer + // allocator", the difference being in the translation. A node id is + // composed of two components: the index of the block in which it was + // allocated, and the index within the block. With the default settings, + // where the number of nodes per block is 4096, the node id (minus 1) is: + // + // bit position: 11 0 + // +----------------------------+--------------+ + // | Index of the block |Index in block| + // +----------------------------+--------------+ + // + // The actual node id is the above plus 1, to avoid creating a node id of 0. 
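+  // For example, with the default 4096 nodes per block (i.e. 12 index bits),
+  // the node at index 2 of block 2 gets the id ((2 << 12) | 2) + 1 = 8195,
+  // and ptr(8195) recovers the block number and in-block index with the
+  // inverse shift and mask.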
+ // + // This method significantly improved the build time, compared to using maps + // (std::unordered_map or DenseMap) to translate between pointers and ids. + struct NodeAllocator { + // Amount of storage for a single node. + enum { NodeMemSize = 32 }; + NodeAllocator(uint32_t NPB = 4096) + : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)), + IndexMask((1 << BitsPerIndex)-1), ActiveEnd(nullptr) { + assert(isPowerOf2_32(NPB)); + } + NodeBase *ptr(NodeId N) const { + uint32_t N1 = N-1; + uint32_t BlockN = N1 >> BitsPerIndex; + uint32_t Offset = (N1 & IndexMask) * NodeMemSize; + return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset); + } + NodeId id(const NodeBase *P) const; + NodeAddr<NodeBase*> New(); + void clear(); + + private: + void startNewBlock(); + bool needNewBlock(); + uint32_t makeId(uint32_t Block, uint32_t Index) const { + // Add 1 to the id, to avoid the id of 0, which is treated as "null". + return ((Block << BitsPerIndex) | Index) + 1; + } + + const uint32_t NodesPerBlock; + const uint32_t BitsPerIndex; + const uint32_t IndexMask; + char *ActiveEnd; + std::vector<char*> Blocks; + typedef BumpPtrAllocatorImpl<MallocAllocator, 65536> AllocatorTy; + AllocatorTy MemPool; + }; + + struct RegisterRef { + unsigned Reg, Sub; + + // No non-trivial constructors, since this will be a member of a union. + RegisterRef() = default; + RegisterRef(const RegisterRef &RR) = default; + RegisterRef &operator= (const RegisterRef &RR) = default; + bool operator== (const RegisterRef &RR) const { + return Reg == RR.Reg && Sub == RR.Sub; + } + bool operator!= (const RegisterRef &RR) const { + return !operator==(RR); + } + bool operator< (const RegisterRef &RR) const { + return Reg < RR.Reg || (Reg == RR.Reg && Sub < RR.Sub); + } + }; + typedef std::set<RegisterRef> RegisterSet; + + struct RegisterAliasInfo { + RegisterAliasInfo(const TargetRegisterInfo &tri) : TRI(tri) {} + virtual ~RegisterAliasInfo() {} + + virtual std::vector<RegisterRef> getAliasSet(RegisterRef RR) const; + virtual bool alias(RegisterRef RA, RegisterRef RB) const; + virtual bool covers(RegisterRef RA, RegisterRef RB) const; + virtual bool covers(const RegisterSet &RRs, RegisterRef RR) const; + + const TargetRegisterInfo &TRI; + }; + + struct TargetOperandInfo { + TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {} + virtual ~TargetOperandInfo() {} + virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const; + virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const; + virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const; + + const TargetInstrInfo &TII; + }; + + + struct DataFlowGraph; + + struct NodeBase { + public: + // Make sure this is a POD. + NodeBase() = default; + uint16_t getType() const { return NodeAttrs::type(Attrs); } + uint16_t getKind() const { return NodeAttrs::kind(Attrs); } + uint16_t getFlags() const { return NodeAttrs::flags(Attrs); } + NodeId getNext() const { return Next; } + + uint16_t getAttrs() const { return Attrs; } + void setAttrs(uint16_t A) { Attrs = A; } + void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); } + + // Insert node NA after "this" in the circular chain. + void append(NodeAddr<NodeBase*> NA); + // Initialize all members to 0. + void init() { memset(this, 0, sizeof *this); } + void setNext(NodeId N) { Next = N; } + + protected: + uint16_t Attrs; + uint16_t Reserved; + NodeId Next; // Id of the next node in the circular chain. + // Definitions of nested types. 
Using anonymous nested structs would make + // this class definition clearer, but unnamed structs are not a part of + // the standard. + struct Def_struct { + NodeId DD, DU; // Ids of the first reached def and use. + }; + struct PhiU_struct { + NodeId PredB; // Id of the predecessor block for a phi use. + }; + struct Code_struct { + void *CP; // Pointer to the actual code. + NodeId FirstM, LastM; // Id of the first member and last. + }; + struct Ref_struct { + NodeId RD, Sib; // Ids of the reaching def and the sibling. + union { + Def_struct Def; + PhiU_struct PhiU; + }; + union { + MachineOperand *Op; // Non-phi refs point to a machine operand. + RegisterRef RR; // Phi refs store register info directly. + }; + }; + + // The actual payload. + union { + Ref_struct Ref; + Code_struct Code; + }; + }; + // The allocator allocates chunks of 32 bytes for each node. The fact that + // each node takes 32 bytes in memory is used for fast translation between + // the node id and the node address. + static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize, + "NodeBase must be at most NodeAllocator::NodeMemSize bytes"); + + typedef std::vector<NodeAddr<NodeBase*>> NodeList; + typedef std::set<NodeId> NodeSet; + + struct RefNode : public NodeBase { + RefNode() = default; + RegisterRef getRegRef() const; + MachineOperand &getOp() { + assert(!(getFlags() & NodeAttrs::PhiRef)); + return *Ref.Op; + } + void setRegRef(RegisterRef RR); + void setRegRef(MachineOperand *Op); + NodeId getReachingDef() const { + return Ref.RD; + } + void setReachingDef(NodeId RD) { + Ref.RD = RD; + } + NodeId getSibling() const { + return Ref.Sib; + } + void setSibling(NodeId Sib) { + Ref.Sib = Sib; + } + bool isUse() const { + assert(getType() == NodeAttrs::Ref); + return getKind() == NodeAttrs::Use; + } + bool isDef() const { + assert(getType() == NodeAttrs::Ref); + return getKind() == NodeAttrs::Def; + } + + template <typename Predicate> + NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly, + const DataFlowGraph &G); + NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G); + }; + + struct DefNode : public RefNode { + NodeId getReachedDef() const { + return Ref.Def.DD; + } + void setReachedDef(NodeId D) { + Ref.Def.DD = D; + } + NodeId getReachedUse() const { + return Ref.Def.DU; + } + void setReachedUse(NodeId U) { + Ref.Def.DU = U; + } + + void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); + }; + + struct UseNode : public RefNode { + void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); + }; + + struct PhiUseNode : public UseNode { + NodeId getPredecessor() const { + assert(getFlags() & NodeAttrs::PhiRef); + return Ref.PhiU.PredB; + } + void setPredecessor(NodeId B) { + assert(getFlags() & NodeAttrs::PhiRef); + Ref.PhiU.PredB = B; + } + }; + + struct CodeNode : public NodeBase { + template <typename T> T getCode() const { + return static_cast<T>(Code.CP); + } + void setCode(void *C) { + Code.CP = C; + } + + NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const; + NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const; + void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); + void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, + const DataFlowGraph &G); + void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); + + NodeList members(const DataFlowGraph &G) const; + template <typename Predicate> + NodeList members_if(Predicate P, const DataFlowGraph &G) const; + }; + + struct InstrNode : public CodeNode { + NodeAddr<NodeBase*> getOwner(const 
DataFlowGraph &G); + }; + + struct PhiNode : public InstrNode { + MachineInstr *getCode() const { + return nullptr; + } + }; + + struct StmtNode : public InstrNode { + MachineInstr *getCode() const { + return CodeNode::getCode<MachineInstr*>(); + } + }; + + struct BlockNode : public CodeNode { + MachineBasicBlock *getCode() const { + return CodeNode::getCode<MachineBasicBlock*>(); + } + void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G); + }; + + struct FuncNode : public CodeNode { + MachineFunction *getCode() const { + return CodeNode::getCode<MachineFunction*>(); + } + NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB, + const DataFlowGraph &G) const; + NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G); + }; + + struct DataFlowGraph { + DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, + const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf, const RegisterAliasInfo &rai, + const TargetOperandInfo &toi); + + NodeBase *ptr(NodeId N) const; + template <typename T> T ptr(NodeId N) const { + return static_cast<T>(ptr(N)); + } + NodeId id(const NodeBase *P) const; + + template <typename T> NodeAddr<T> addr(NodeId N) const { + return { ptr<T>(N), N }; + } + + NodeAddr<FuncNode*> getFunc() const { + return Func; + } + MachineFunction &getMF() const { + return MF; + } + const TargetInstrInfo &getTII() const { + return TII; + } + const TargetRegisterInfo &getTRI() const { + return TRI; + } + const MachineDominatorTree &getDT() const { + return MDT; + } + const MachineDominanceFrontier &getDF() const { + return MDF; + } + const RegisterAliasInfo &getRAI() const { + return RAI; + } + + struct DefStack { + DefStack() = default; + bool empty() const { return Stack.empty() || top() == bottom(); } + private: + typedef NodeAddr<DefNode*> value_type; + struct Iterator { + typedef DefStack::value_type value_type; + Iterator &up() { Pos = DS.nextUp(Pos); return *this; } + Iterator &down() { Pos = DS.nextDown(Pos); return *this; } + value_type operator*() const { + assert(Pos >= 1); + return DS.Stack[Pos-1]; + } + const value_type *operator->() const { + assert(Pos >= 1); + return &DS.Stack[Pos-1]; + } + bool operator==(const Iterator &It) const { return Pos == It.Pos; } + bool operator!=(const Iterator &It) const { return Pos != It.Pos; } + private: + Iterator(const DefStack &S, bool Top); + // Pos-1 is the index in the StorageType object that corresponds to + // the top of the DefStack. 
+ const DefStack &DS; + unsigned Pos; + friend struct DefStack; + }; + public: + typedef Iterator iterator; + iterator top() const { return Iterator(*this, true); } + iterator bottom() const { return Iterator(*this, false); } + unsigned size() const; + + void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); } + void pop(); + void start_block(NodeId N); + void clear_block(NodeId N); + private: + friend struct Iterator; + typedef std::vector<value_type> StorageType; + bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const { + return (P.Addr == nullptr) && (N == 0 || P.Id == N); + } + unsigned nextUp(unsigned P) const; + unsigned nextDown(unsigned P) const; + StorageType Stack; + }; + + typedef std::map<RegisterRef,DefStack> DefStackMap; + + void build(); + void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM); + void markBlock(NodeId B, DefStackMap &DefM); + void releaseBlock(NodeId B, DefStackMap &DefM); + + NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create); + NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create); + NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + + NodeList getRelatedRefs(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + + void unlinkUse(NodeAddr<UseNode*> UA); + void unlinkDef(NodeAddr<DefNode*> DA); + + // Some useful filters. + template <uint16_t Kind> + static bool IsRef(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == Kind; + } + template <uint16_t Kind> + static bool IsCode(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Code && + BA.Addr->getKind() == Kind; + } + static bool IsDef(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == NodeAttrs::Def; + } + static bool IsUse(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == NodeAttrs::Use; + } + static bool IsPhi(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Code && + BA.Addr->getKind() == NodeAttrs::Phi; + } + + private: + void reset(); + + NodeAddr<NodeBase*> newNode(uint16_t Attrs); + NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B); + NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags = NodeAttrs::None); + NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner, + RegisterRef RR, NodeAddr<BlockNode*> PredB, + uint16_t Flags = NodeAttrs::PhiRef); + NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags = NodeAttrs::None); + NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, + RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef); + NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner); + NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner, + MachineInstr *MI); + NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner, + MachineBasicBlock *BB); + NodeAddr<FuncNode*> newFunc(MachineFunction *MF); + + template <typename Predicate> + std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>> + locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + Predicate P) const; + + typedef std::map<NodeId,RegisterSet> BlockRefsMap; + + void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In); + 
void buildBlockRefs(NodeAddr<BlockNode*> BA, BlockRefsMap &RefM); + void recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA); + void buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA); + void removeUnusedPhis(); + + template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA, + NodeAddr<T> TA, DefStack &DS); + void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA); + void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA); + + TimerGroup TimeG; + NodeAddr<FuncNode*> Func; + NodeAllocator Memory; + + MachineFunction &MF; + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; + const MachineDominatorTree &MDT; + const MachineDominanceFrontier &MDF; + const RegisterAliasInfo &RAI; + const TargetOperandInfo &TOI; + }; // struct DataFlowGraph + + template <typename Predicate> + NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P, + bool NextOnly, const DataFlowGraph &G) { + // Get the "Next" reference in the circular list that references RR and + // satisfies predicate "Pred". + auto NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + if (NA.Addr->getType() == NodeAttrs::Ref) { + NodeAddr<RefNode*> RA = NA; + if (RA.Addr->getRegRef() == RR && P(NA)) + return NA; + if (NextOnly) + break; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } else { + // We've hit the beginning of the chain. + assert(NA.Addr->getType() == NodeAttrs::Code); + NodeAddr<CodeNode*> CA = NA; + NA = CA.Addr->getFirstMember(G); + } + } + // Return the equivalent of "nullptr" if such a node was not found. + return NodeAddr<RefNode*>(); + } + + template <typename Predicate> + NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const { + NodeList MM; + auto M = getFirstMember(G); + if (M.Id == 0) + return MM; + + while (M.Addr != this) { + if (P(M)) + MM.push_back(M); + M = G.addr<NodeBase*>(M.Addr->getNext()); + } + return MM; + } + + + template <typename T> struct Print; + template <typename T> + raw_ostream &operator<< (raw_ostream &OS, const Print<T> &P); + + template <typename T> + struct Print { + Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {} + const T &Obj; + const DataFlowGraph &G; + }; + + template <typename T> + struct PrintNode : Print<NodeAddr<T>> { + PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g) + : Print<NodeAddr<T>>(x, g) {} + }; +} // namespace rdf + +#endif // RDF_GRAPH_H diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp new file mode 100644 index 000000000000..1d9bd372ff4e --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp @@ -0,0 +1,848 @@ +//===--- RDFLiveness.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Computation of the liveness information from the data-flow graph. +// +// The main functionality of this code is to compute block live-in +// information. With the live-in information in place, the placement +// of kill flags can also be recalculated. +// +// The block live-in calculation is based on the ideas from the following +// publication: +// +// Dibyendu Das, Ramakrishna Upadrasta, Benoit Dupont de Dinechin. +// "Efficient Liveness Computation Using Merge Sets and DJ-Graphs." 
+// ACM Transactions on Architecture and Code Optimization, Association for +// Computing Machinery, 2012, ACM TACO Special Issue on "High-Performance +// and Embedded Architectures and Compilers", 8 (4), +// <10.1145/2086696.2086706>. <hal-00647369> +// +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +namespace rdf { + template<> + raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) { + OS << '{'; + for (auto I : P.Obj) { + OS << ' ' << Print<RegisterRef>(I.first, P.G) << '{'; + for (auto J = I.second.begin(), E = I.second.end(); J != E; ) { + OS << Print<NodeId>(*J, P.G); + if (++J != E) + OS << ','; + } + OS << '}'; + } + OS << " }"; + return OS; + } +} + +// The order in the returned sequence is the order of reaching defs in the +// upward traversal: the first def is the closest to the given reference RefA, +// the next one is further up, and so on. +// The list ends at a reaching phi def, or when the reference from RefA is +// covered by the defs in the list (see FullChain). +// This function provides two modes of operation: +// (1) Returning the sequence of reaching defs for a particular reference +// node. This sequence will terminate at the first phi node [1]. +// (2) Returning a partial sequence of reaching defs, where the final goal +// is to traverse past phi nodes to the actual defs arising from the code +// itself. +// In mode (2), the register reference for which the search was started +// may be different from the reference node RefA, for which this call was +// made, hence the argument RefRR, which holds the original register. +// Also, some definitions may have already been encountered in a previous +// call that will influence register covering. The register references +// already defined are passed in through DefRRs. +// In mode (1), the "continuation" considerations do not apply, and the +// RefRR is the same as the register in RefA, and the set DefRRs is empty. +// +// [1] It is possible for multiple phi nodes to be included in the returned +// sequence: +// SubA = phi ... +// SubB = phi ... +// ... = SuperAB(rdef:SubA), SuperAB"(rdef:SubB) +// However, these phi nodes are independent from one another in terms of +// the data-flow. + +NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, + NodeAddr<RefNode*> RefA, bool FullChain, const RegisterSet &DefRRs) { + SetVector<NodeId> DefQ; + SetVector<NodeId> Owners; + + // The initial queue should not have reaching defs for shadows. The + // whole point of a shadow is that it will have a reaching def that + // is not aliased to the reaching defs of the related shadows. + NodeId Start = RefA.Id; + auto SNA = DFG.addr<RefNode*>(Start); + if (NodeId RD = SNA.Addr->getReachingDef()) + DefQ.insert(RD); + + // Collect all the reaching defs, going up until a phi node is encountered, + // or there are no more reaching defs. From this set, the actual set of + // reaching defs will be selected. + // The traversal upwards must go on until a covering def is encountered. + // It is possible that a collection of non-covering (individually) defs + // will be sufficient, but keep going until a covering one is found. 
+ for (unsigned i = 0; i < DefQ.size(); ++i) { + auto TA = DFG.addr<DefNode*>(DefQ[i]); + if (TA.Addr->getFlags() & NodeAttrs::PhiRef) + continue; + // Stop at the covering/overwriting def of the initial register reference. + RegisterRef RR = TA.Addr->getRegRef(); + if (RAI.covers(RR, RefRR)) { + uint16_t Flags = TA.Addr->getFlags(); + if (!(Flags & NodeAttrs::Preserving)) + continue; + } + // Get the next level of reaching defs. This will include multiple + // reaching defs for shadows. + for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA)) + if (auto RD = NodeAddr<RefNode*>(S).Addr->getReachingDef()) + DefQ.insert(RD); + } + + // Remove all non-phi defs that are not aliased to RefRR, and collect + // the owners of the remaining defs. + SetVector<NodeId> Defs; + for (auto N : DefQ) { + auto TA = DFG.addr<DefNode*>(N); + bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef; + if (!IsPhi && !RAI.alias(RefRR, TA.Addr->getRegRef())) + continue; + Defs.insert(TA.Id); + Owners.insert(TA.Addr->getOwner(DFG).Id); + } + + // Return the MachineBasicBlock containing a given instruction. + auto Block = [this] (NodeAddr<InstrNode*> IA) -> MachineBasicBlock* { + if (IA.Addr->getKind() == NodeAttrs::Stmt) + return NodeAddr<StmtNode*>(IA).Addr->getCode()->getParent(); + assert(IA.Addr->getKind() == NodeAttrs::Phi); + NodeAddr<PhiNode*> PA = IA; + NodeAddr<BlockNode*> BA = PA.Addr->getOwner(DFG); + return BA.Addr->getCode(); + }; + // Less(A,B) iff instruction A is further down in the dominator tree than B. + auto Less = [&Block,this] (NodeId A, NodeId B) -> bool { + if (A == B) + return false; + auto OA = DFG.addr<InstrNode*>(A), OB = DFG.addr<InstrNode*>(B); + MachineBasicBlock *BA = Block(OA), *BB = Block(OB); + if (BA != BB) + return MDT.dominates(BB, BA); + // They are in the same block. + bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt; + bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt; + if (StmtA) { + if (!StmtB) // OB is a phi and phis dominate statements. + return true; + auto CA = NodeAddr<StmtNode*>(OA).Addr->getCode(); + auto CB = NodeAddr<StmtNode*>(OB).Addr->getCode(); + // The order must be linear, so tie-break such equalities. + if (CA == CB) + return A < B; + return MDT.dominates(CB, CA); + } else { + // OA is a phi. + if (StmtB) + return false; + // Both are phis. There is no ordering between phis (in terms of + // the data-flow), so tie-break this via node id comparison. + return A < B; + } + }; + + std::vector<NodeId> Tmp(Owners.begin(), Owners.end()); + std::sort(Tmp.begin(), Tmp.end(), Less); + + // The vector is a list of instructions, so that defs coming from + // the same instruction don't need to be artificially ordered. + // Then, when computing the initial segment, and iterating over an + // instruction, pick the defs that contribute to the covering (i.e. is + // not covered by previously added defs). Check the defs individually, + // i.e. first check each def if is covered or not (without adding them + // to the tracking set), and then add all the selected ones. + + // The reason for this is this example: + // *d1<A>, *d2<B>, ... Assume A and B are aliased (can happen in phi nodes). + // *d3<C> If A \incl BuC, and B \incl AuC, then *d2 would be + // covered if we added A first, and A would be covered + // if we added B first. 
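A compressed sketch of the two-phase "check first, insert after" pattern that the code below applies per instruction (phi-related special cases omitted). It reuses the RAI.covers()/getRegRef() calls and node types of the surrounding function; Candidates and Selected are illustrative names, not variables defined in this file:

    // Phase 1: test every candidate def of this instruction against the
    // tracking set as it stood before the instruction was considered...
    NodeList Selected;
    for (NodeAddr<DefNode*> DA : Candidates)
      if (!RAI.covers(RRs, DA.Addr->getRegRef()))
        Selected.push_back(DA);
    // Phase 2: ...then fold all chosen defs into the tracking set at once, so
    // the outcome does not depend on the order in which aliased defs are seen.
    for (NodeAddr<DefNode*> DA : Selected)
      RRs.insert(DA.Addr->getRegRef());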
+ + NodeList RDefs; + RegisterSet RRs = DefRRs; + + auto DefInSet = [&Defs] (NodeAddr<RefNode*> TA) -> bool { + return TA.Addr->getKind() == NodeAttrs::Def && + Defs.count(TA.Id); + }; + for (auto T : Tmp) { + if (!FullChain && RAI.covers(RRs, RefRR)) + break; + auto TA = DFG.addr<InstrNode*>(T); + bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA); + NodeList Ds; + for (NodeAddr<DefNode*> DA : TA.Addr->members_if(DefInSet, DFG)) { + auto QR = DA.Addr->getRegRef(); + // Add phi defs even if they are covered by subsequent defs. This is + // for cases where the reached use is not covered by any of the defs + // encountered so far: the phi def is needed to expose the liveness + // of that use to the entry of the block. + // Example: + // phi d1<R3>(,d2,), ... Phi def d1 is covered by d2. + // d2<R3>(d1,,u3), ... + // ..., u3<D1>(d2) This use needs to be live on entry. + if (FullChain || IsPhi || !RAI.covers(RRs, QR)) + Ds.push_back(DA); + } + RDefs.insert(RDefs.end(), Ds.begin(), Ds.end()); + for (NodeAddr<DefNode*> DA : Ds) { + // When collecting a full chain of definitions, do not consider phi + // defs to actually define a register. + uint16_t Flags = DA.Addr->getFlags(); + if (!FullChain || !(Flags & NodeAttrs::PhiRef)) + if (!(Flags & NodeAttrs::Preserving)) + RRs.insert(DA.Addr->getRegRef()); + } + } + + return RDefs; +} + + +static const RegisterSet NoRegs; + +NodeList Liveness::getAllReachingDefs(NodeAddr<RefNode*> RefA) { + return getAllReachingDefs(RefA.Addr->getRegRef(), RefA, false, NoRegs); +} + + +void Liveness::computePhiInfo() { + NodeList Phis; + NodeAddr<FuncNode*> FA = DFG.getFunc(); + auto Blocks = FA.Addr->members(DFG); + for (NodeAddr<BlockNode*> BA : Blocks) { + auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); + Phis.insert(Phis.end(), Ps.begin(), Ps.end()); + } + + // phi use -> (map: reaching phi -> set of registers defined in between) + std::map<NodeId,std::map<NodeId,RegisterSet>> PhiUp; + std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation. + + // Go over all phis. + for (NodeAddr<PhiNode*> PhiA : Phis) { + // Go over all defs and collect the reached uses that are non-phi uses + // (i.e. the "real uses"). + auto &RealUses = RealUseMap[PhiA.Id]; + auto PhiRefs = PhiA.Addr->members(DFG); + + // Have a work queue of defs whose reached uses need to be found. + // For each def, add to the queue all reached (non-phi) defs. + SetVector<NodeId> DefQ; + NodeSet PhiDefs; + for (auto R : PhiRefs) { + if (!DFG.IsRef<NodeAttrs::Def>(R)) + continue; + DefQ.insert(R.Id); + PhiDefs.insert(R.Id); + } + for (unsigned i = 0; i < DefQ.size(); ++i) { + NodeAddr<DefNode*> DA = DFG.addr<DefNode*>(DefQ[i]); + NodeId UN = DA.Addr->getReachedUse(); + while (UN != 0) { + NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN); + if (!(A.Addr->getFlags() & NodeAttrs::PhiRef)) + RealUses[getRestrictedRegRef(A)].insert(A.Id); + UN = A.Addr->getSibling(); + } + NodeId DN = DA.Addr->getReachedDef(); + while (DN != 0) { + NodeAddr<DefNode*> A = DFG.addr<DefNode*>(DN); + for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) { + uint16_t Flags = NodeAddr<DefNode*>(T).Addr->getFlags(); + // Must traverse the reached-def chain. Consider: + // def(D0) -> def(R0) -> def(R0) -> use(D0) + // The reachable use of D0 passes through a def of R0. + if (!(Flags & NodeAttrs::PhiRef)) + DefQ.insert(T.Id); + } + DN = A.Addr->getSibling(); + } + } + // Filter out these uses that appear to be reachable, but really + // are not. For example: + // + // R1:0 = d1 + // = R1:0 u2 Reached by d1. 
+ // R0 = d3 + // = R1:0 u4 Still reached by d1: indirectly through + // the def d3. + // R1 = d5 + // = R1:0 u6 Not reached by d1 (covered collectively + // by d3 and d5), but following reached + // defs and uses from d1 will lead here. + auto HasDef = [&PhiDefs] (NodeAddr<DefNode*> DA) -> bool { + return PhiDefs.count(DA.Id); + }; + for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) { + // For each reached register UI->first, there is a set UI->second, of + // uses of it. For each such use, check if it is reached by this phi, + // i.e. check if the set of its reaching uses intersects the set of + // this phi's defs. + auto &Uses = UI->second; + for (auto I = Uses.begin(), E = Uses.end(); I != E; ) { + auto UA = DFG.addr<UseNode*>(*I); + NodeList RDs = getAllReachingDefs(UI->first, UA); + if (std::any_of(RDs.begin(), RDs.end(), HasDef)) + ++I; + else + I = Uses.erase(I); + } + if (Uses.empty()) + UI = RealUses.erase(UI); + else + ++UI; + } + + // If this phi reaches some "real" uses, add it to the queue for upward + // propagation. + if (!RealUses.empty()) + PhiUQ.push_back(PhiA.Id); + + // Go over all phi uses and check if the reaching def is another phi. + // Collect the phis that are among the reaching defs of these uses. + // While traversing the list of reaching defs for each phi use, collect + // the set of registers defined between this phi (Phi) and the owner phi + // of the reaching def. + for (auto I : PhiRefs) { + if (!DFG.IsRef<NodeAttrs::Use>(I)) + continue; + NodeAddr<UseNode*> UA = I; + auto &UpMap = PhiUp[UA.Id]; + RegisterSet DefRRs; + for (NodeAddr<DefNode*> DA : getAllReachingDefs(UA)) { + if (DA.Addr->getFlags() & NodeAttrs::PhiRef) + UpMap[DA.Addr->getOwner(DFG).Id] = DefRRs; + else + DefRRs.insert(DA.Addr->getRegRef()); + } + } + } + + if (Trace) { + dbgs() << "Phi-up-to-phi map:\n"; + for (auto I : PhiUp) { + dbgs() << "phi " << Print<NodeId>(I.first, DFG) << " -> {"; + for (auto R : I.second) + dbgs() << ' ' << Print<NodeId>(R.first, DFG) + << Print<RegisterSet>(R.second, DFG); + dbgs() << " }\n"; + } + } + + // Propagate the reached registers up in the phi chain. + // + // The following type of situation needs careful handling: + // + // phi d1<R1:0> (1) + // | + // ... d2<R1> + // | + // phi u3<R1:0> (2) + // | + // ... u4<R1> + // + // The phi node (2) defines a register pair R1:0, and reaches a "real" + // use u4 of just R1. The same phi node is also known to reach (upwards) + // the phi node (1). However, the use u4 is not reached by phi (1), + // because of the intervening definition d2 of R1. The data flow between + // phis (1) and (2) is restricted to R1:0 minus R1, i.e. R0. + // + // When propagating uses up the phi chains, get the all reaching defs + // for a given phi use, and traverse the list until the propagated ref + // is covered, or until or until reaching the final phi. Only assume + // that the reference reaches the phi in the latter case. + + for (unsigned i = 0; i < PhiUQ.size(); ++i) { + auto PA = DFG.addr<PhiNode*>(PhiUQ[i]); + auto &RealUses = RealUseMap[PA.Id]; + for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) { + NodeAddr<UseNode*> UA = U; + auto &UpPhis = PhiUp[UA.Id]; + for (auto UP : UpPhis) { + bool Changed = false; + auto &MidDefs = UP.second; + // Collect the set UpReached of uses that are reached by the current + // phi PA, and are not covered by any intervening def between PA and + // the upward phi UP. 
+ RegisterSet UpReached; + for (auto T : RealUses) { + if (!isRestricted(PA, UA, T.first)) + continue; + if (!RAI.covers(MidDefs, T.first)) + UpReached.insert(T.first); + } + if (UpReached.empty()) + continue; + // Update the set PRUs of real uses reached by the upward phi UP with + // the actual set of uses (UpReached) that the UP phi reaches. + auto &PRUs = RealUseMap[UP.first]; + for (auto R : UpReached) { + unsigned Z = PRUs[R].size(); + PRUs[R].insert(RealUses[R].begin(), RealUses[R].end()); + Changed |= (PRUs[R].size() != Z); + } + if (Changed) + PhiUQ.push_back(UP.first); + } + } + } + + if (Trace) { + dbgs() << "Real use map:\n"; + for (auto I : RealUseMap) { + dbgs() << "phi " << Print<NodeId>(I.first, DFG); + NodeAddr<PhiNode*> PA = DFG.addr<PhiNode*>(I.first); + NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG); + if (!Ds.empty()) { + RegisterRef RR = NodeAddr<DefNode*>(Ds[0]).Addr->getRegRef(); + dbgs() << '<' << Print<RegisterRef>(RR, DFG) << '>'; + } else { + dbgs() << "<noreg>"; + } + dbgs() << " -> " << Print<RefMap>(I.second, DFG) << '\n'; + } + } +} + + +void Liveness::computeLiveIns() { + // Populate the node-to-block map. This speeds up the calculations + // significantly. + NBMap.clear(); + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + MachineBasicBlock *BB = BA.Addr->getCode(); + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + NBMap.insert(std::make_pair(RA.Id, BB)); + NBMap.insert(std::make_pair(IA.Id, BB)); + } + } + + MachineFunction &MF = DFG.getMF(); + + // Compute IDF first, then the inverse. + decltype(IIDF) IDF; + for (auto &B : MF) { + auto F1 = MDF.find(&B); + if (F1 == MDF.end()) + continue; + SetVector<MachineBasicBlock*> IDFB(F1->second.begin(), F1->second.end()); + for (unsigned i = 0; i < IDFB.size(); ++i) { + auto F2 = MDF.find(IDFB[i]); + if (F2 != MDF.end()) + IDFB.insert(F2->second.begin(), F2->second.end()); + } + // Add B to the IDF(B). This will put B in the IIDF(B). + IDFB.insert(&B); + IDF[&B].insert(IDFB.begin(), IDFB.end()); + } + + for (auto I : IDF) + for (auto S : I.second) + IIDF[S].insert(I.first); + + computePhiInfo(); + + NodeAddr<FuncNode*> FA = DFG.getFunc(); + auto Blocks = FA.Addr->members(DFG); + + // Build the phi live-on-entry map. + for (NodeAddr<BlockNode*> BA : Blocks) { + MachineBasicBlock *MB = BA.Addr->getCode(); + auto &LON = PhiLON[MB]; + for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG)) + for (auto S : RealUseMap[P.Id]) + LON[S.first].insert(S.second.begin(), S.second.end()); + } + + if (Trace) { + dbgs() << "Phi live-on-entry map:\n"; + for (auto I : PhiLON) + dbgs() << "block #" << I.first->getNumber() << " -> " + << Print<RefMap>(I.second, DFG) << '\n'; + } + + // Build the phi live-on-exit map. Each phi node has some set of reached + // "real" uses. Propagate this set backwards into the block predecessors + // through the reaching defs of the corresponding phi uses. + for (NodeAddr<BlockNode*> BA : Blocks) { + auto Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); + for (NodeAddr<PhiNode*> PA : Phis) { + auto &RUs = RealUseMap[PA.Id]; + if (RUs.empty()) + continue; + + for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) { + NodeAddr<PhiUseNode*> UA = U; + if (UA.Addr->getReachingDef() == 0) + continue; + + // Mark all reached "real" uses of P as live on exit in the + // predecessor. + // Remap all the RUs so that they have a correct reaching def. 
+ auto PrA = DFG.addr<BlockNode*>(UA.Addr->getPredecessor()); + auto &LOX = PhiLOX[PrA.Addr->getCode()]; + for (auto R : RUs) { + RegisterRef RR = R.first; + if (!isRestricted(PA, UA, RR)) + RR = getRestrictedRegRef(UA); + // The restricted ref may be different from the ref that was + // accessed in the "real use". This means that this phi use + // is not the one that carries this reference, so skip it. + if (!RAI.alias(R.first, RR)) + continue; + for (auto D : getAllReachingDefs(RR, UA)) + LOX[RR].insert(D.Id); + } + } // for U : phi uses + } // for P : Phis + } // for B : Blocks + + if (Trace) { + dbgs() << "Phi live-on-exit map:\n"; + for (auto I : PhiLOX) + dbgs() << "block #" << I.first->getNumber() << " -> " + << Print<RefMap>(I.second, DFG) << '\n'; + } + + RefMap LiveIn; + traverse(&MF.front(), LiveIn); + + // Add function live-ins to the live-in set of the function entry block. + auto &EntryIn = LiveMap[&MF.front()]; + for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) + EntryIn.insert({I->first,0}); + + if (Trace) { + // Dump the liveness map + for (auto &B : MF) { + BitVector LV(TRI.getNumRegs()); + for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I) + LV.set(I->PhysReg); + dbgs() << "BB#" << B.getNumber() << "\t rec = {"; + for (int x = LV.find_first(); x >= 0; x = LV.find_next(x)) + dbgs() << ' ' << Print<RegisterRef>({unsigned(x),0}, DFG); + dbgs() << " }\n"; + dbgs() << "\tcomp = " << Print<RegisterSet>(LiveMap[&B], DFG) << '\n'; + } + } +} + + +void Liveness::resetLiveIns() { + for (auto &B : DFG.getMF()) { + // Remove all live-ins. + std::vector<unsigned> T; + for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I) + T.push_back(I->PhysReg); + for (auto I : T) + B.removeLiveIn(I); + // Add the newly computed live-ins. + auto &LiveIns = LiveMap[&B]; + for (auto I : LiveIns) { + assert(I.Sub == 0); + B.addLiveIn(I.Reg); + } + } +} + + +void Liveness::resetKills() { + for (auto &B : DFG.getMF()) + resetKills(&B); +} + + +void Liveness::resetKills(MachineBasicBlock *B) { + auto CopyLiveIns = [] (MachineBasicBlock *B, BitVector &LV) -> void { + for (auto I = B->livein_begin(), E = B->livein_end(); I != E; ++I) + LV.set(I->PhysReg); + }; + + BitVector LiveIn(TRI.getNumRegs()), Live(TRI.getNumRegs()); + CopyLiveIns(B, LiveIn); + for (auto SI : B->successors()) + CopyLiveIns(SI, Live); + + for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) { + MachineInstr *MI = &*I; + if (MI->isDebugValue()) + continue; + + MI->clearKillInfo(); + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(R)) + continue; + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) + Live.reset(*SR); + } + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(R)) + continue; + bool IsLive = false; + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) { + if (!Live[*SR]) + continue; + IsLive = true; + break; + } + if (IsLive) + continue; + Op.setIsKill(true); + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) + Live.set(*SR); + } + } +} + + +// For shadows, determine if RR is aliased to a reaching def of any other +// shadow associated with RA. If it is not, then RR is "restricted" to RA, +// and so it can be considered a value specific to RA. This is important +// for accurately determining values associated with phi uses. 
+// For non-shadows, this function returns "true". +bool Liveness::isRestricted(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + RegisterRef RR) const { + NodeId Start = RA.Id; + for (NodeAddr<RefNode*> TA = DFG.getNextShadow(IA, RA); + TA.Id != 0 && TA.Id != Start; TA = DFG.getNextShadow(IA, TA)) { + NodeId RD = TA.Addr->getReachingDef(); + if (RD == 0) + continue; + if (RAI.alias(RR, DFG.addr<DefNode*>(RD).Addr->getRegRef())) + return false; + } + return true; +} + + +RegisterRef Liveness::getRestrictedRegRef(NodeAddr<RefNode*> RA) const { + assert(DFG.IsRef<NodeAttrs::Use>(RA)); + if (RA.Addr->getFlags() & NodeAttrs::Shadow) { + NodeId RD = RA.Addr->getReachingDef(); + assert(RD); + RA = DFG.addr<DefNode*>(RD); + } + return RA.Addr->getRegRef(); +} + + +unsigned Liveness::getPhysReg(RegisterRef RR) const { + if (!TargetRegisterInfo::isPhysicalRegister(RR.Reg)) + return 0; + return RR.Sub ? TRI.getSubReg(RR.Reg, RR.Sub) : RR.Reg; +} + + +// Helper function to obtain the basic block containing the reaching def +// of the given use. +MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const { + auto F = NBMap.find(RN); + if (F != NBMap.end()) + return F->second; + llvm_unreachable("Node id not in map"); +} + + +void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { + // The LiveIn map, for each (physical) register, contains the set of live + // reaching defs of that register that are live on entry to the associated + // block. + + // The summary of the traversal algorithm: + // + // R is live-in in B, if there exists a U(R), such that rdef(R) dom B + // and (U \in IDF(B) or B dom U). + // + // for (C : children) { + // LU = {} + // traverse(C, LU) + // LiveUses += LU + // } + // + // LiveUses -= Defs(B); + // LiveUses += UpwardExposedUses(B); + // for (C : IIDF[B]) + // for (U : LiveUses) + // if (Rdef(U) dom C) + // C.addLiveIn(U) + // + + // Go up the dominator tree (depth-first). + MachineDomTreeNode *N = MDT.getNode(B); + for (auto I : *N) { + RefMap L; + MachineBasicBlock *SB = I->getBlock(); + traverse(SB, L); + + for (auto S : L) + LiveIn[S.first].insert(S.second.begin(), S.second.end()); + } + + if (Trace) { + dbgs() << LLVM_FUNCTION_NAME << " in BB#" << B->getNumber() + << " after recursion into"; + for (auto I : *N) + dbgs() << ' ' << I->getBlock()->getNumber(); + dbgs() << "\n LiveIn: " << Print<RefMap>(LiveIn, DFG); + dbgs() << "\n Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Add phi uses that are live on exit from this block. + RefMap &PUs = PhiLOX[B]; + for (auto S : PUs) + LiveIn[S.first].insert(S.second.begin(), S.second.end()); + + if (Trace) { + dbgs() << "after LOX\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Stop tracking all uses defined in this block: erase those records + // where the reaching def is located in B and which cover all reached + // uses. + auto Copy = LiveIn; + LiveIn.clear(); + + for (auto I : Copy) { + auto &Defs = LiveIn[I.first]; + NodeSet Rest; + for (auto R : I.second) { + auto DA = DFG.addr<DefNode*>(R); + RegisterRef DDR = DA.Addr->getRegRef(); + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG); + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + // Defs from a different block need to be preserved. Defs from this + // block will need to be processed further, except for phi defs, the + // liveness of which is handled through the PhiLON/PhiLOX maps. 
+ if (B != BA.Addr->getCode()) + Defs.insert(R); + else { + bool IsPreserving = DA.Addr->getFlags() & NodeAttrs::Preserving; + if (IA.Addr->getKind() != NodeAttrs::Phi && !IsPreserving) { + bool Covering = RAI.covers(DDR, I.first); + NodeId U = DA.Addr->getReachedUse(); + while (U && Covering) { + auto DUA = DFG.addr<UseNode*>(U); + RegisterRef Q = DUA.Addr->getRegRef(); + Covering = RAI.covers(DA.Addr->getRegRef(), Q); + U = DUA.Addr->getSibling(); + } + if (!Covering) + Rest.insert(R); + } + } + } + + // Non-covering defs from B. + for (auto R : Rest) { + auto DA = DFG.addr<DefNode*>(R); + RegisterRef DRR = DA.Addr->getRegRef(); + RegisterSet RRs; + for (NodeAddr<DefNode*> TA : getAllReachingDefs(DA)) { + NodeAddr<InstrNode*> IA = TA.Addr->getOwner(DFG); + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + // Preserving defs do not count towards covering. + if (!(TA.Addr->getFlags() & NodeAttrs::Preserving)) + RRs.insert(TA.Addr->getRegRef()); + if (BA.Addr->getCode() == B) + continue; + if (RAI.covers(RRs, DRR)) + break; + Defs.insert(TA.Id); + } + } + } + + emptify(LiveIn); + + if (Trace) { + dbgs() << "after defs in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Scan the block for upward-exposed uses and add them to the tracking set. + for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) { + NodeAddr<InstrNode*> IA = I; + if (IA.Addr->getKind() != NodeAttrs::Stmt) + continue; + for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { + RegisterRef RR = UA.Addr->getRegRef(); + for (auto D : getAllReachingDefs(UA)) + if (getBlockWithRef(D.Id) != B) + LiveIn[RR].insert(D.Id); + } + } + + if (Trace) { + dbgs() << "after uses in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Phi uses should not be propagated up the dominator tree, since they + // are not dominated by their corresponding reaching defs. + auto &Local = LiveMap[B]; + auto &LON = PhiLON[B]; + for (auto R : LON) + Local.insert(R.first); + + if (Trace) { + dbgs() << "after phi uses in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(Local, DFG) << '\n'; + } + + for (auto C : IIDF[B]) { + auto &LiveC = LiveMap[C]; + for (auto S : LiveIn) + for (auto R : S.second) + if (MDT.properlyDominates(getBlockWithRef(R), C)) + LiveC.insert(S.first); + } +} + + +void Liveness::emptify(RefMap &M) { + for (auto I = M.begin(), E = M.end(); I != E; ) + I = I->second.empty() ? M.erase(I) : std::next(I); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h new file mode 100644 index 000000000000..4c1e8f3ee838 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h @@ -0,0 +1,106 @@ +//===--- RDFLiveness.h ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Recalculate the liveness information given a data flow graph. +// This includes block live-ins and kill flags. 
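Going by the interface declared below, a typical driver for this class would run its phases in the following order; a hedged sketch only, assuming the pass boilerplate and the rdf::DataFlowGraph construction exist elsewhere:

    rdf::Liveness LV(MF.getRegInfo(), G);   // G: an already-built rdf::DataFlowGraph
    LV.trace(EnableTrace);                  // optional debug dumps
    LV.computeLiveIns();                    // phi info + block live-in sets
    LV.resetLiveIns();                      // rewrite MachineBasicBlock live-ins
    LV.resetKills();                        // recompute kill flags from new live-ins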
+ +#ifndef RDF_LIVENESS_H +#define RDF_LIVENESS_H + +#include "RDFGraph.h" +#include "llvm/ADT/DenseMap.h" +#include <map> + +using namespace llvm; + +namespace llvm { + class MachineBasicBlock; + class MachineFunction; + class MachineRegisterInfo; + class TargetRegisterInfo; + class MachineDominatorTree; + class MachineDominanceFrontier; +} + +namespace rdf { + struct Liveness { + public: + typedef std::map<MachineBasicBlock*,RegisterSet> LiveMapType; + typedef std::map<RegisterRef,NodeSet> RefMap; + + Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g) + : DFG(g), TRI(g.getTRI()), MDT(g.getDT()), MDF(g.getDF()), + RAI(g.getRAI()), MRI(mri), Empty(), Trace(false) {} + + NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA, + bool FullChain = false, const RegisterSet &DefRRs = RegisterSet()); + NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA); + + LiveMapType &getLiveMap() { return LiveMap; } + const LiveMapType &getLiveMap() const { return LiveMap; } + const RefMap &getRealUses(NodeId P) const { + auto F = RealUseMap.find(P); + return F == RealUseMap.end() ? Empty : F->second; + } + + void computePhiInfo(); + void computeLiveIns(); + void resetLiveIns(); + void resetKills(); + void resetKills(MachineBasicBlock *B); + + void trace(bool T) { Trace = T; } + + private: + const DataFlowGraph &DFG; + const TargetRegisterInfo &TRI; + const MachineDominatorTree &MDT; + const MachineDominanceFrontier &MDF; + const RegisterAliasInfo &RAI; + MachineRegisterInfo &MRI; + LiveMapType LiveMap; + const RefMap Empty; + bool Trace; + + // Cache of mapping from node ids (for RefNodes) to the containing + // basic blocks. Not computing it each time for each node reduces + // the liveness calculation time by a large fraction. + typedef DenseMap<NodeId,MachineBasicBlock*> NodeBlockMap; + NodeBlockMap NBMap; + + // Phi information: + // + // map: NodeId -> (map: RegisterRef -> NodeSet) + // phi id -> (map: register -> set of reached non-phi uses) + std::map<NodeId, RefMap> RealUseMap; + + // Inverse iterated dominance frontier. + std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF; + + // Live on entry. + std::map<MachineBasicBlock*,RefMap> PhiLON; + + // Phi uses are considered to be located at the end of the block that + // they are associated with. The reaching def of a phi use dominates the + // block that the use corresponds to, but not the block that contains + // the phi itself. To include these uses in the liveness propagation (up + // the dominator tree), create a map: block -> set of uses live on exit. 
+ std::map<MachineBasicBlock*,RefMap> PhiLOX; + + bool isRestricted(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + RegisterRef RR) const; + RegisterRef getRestrictedRegRef(NodeAddr<RefNode*> RA) const; + unsigned getPhysReg(RegisterRef RR) const; + MachineBasicBlock *getBlockWithRef(NodeId RN) const; + void traverse(MachineBasicBlock *B, RefMap &LiveIn); + void emptify(RefMap &M); + }; +} + +#endif // RDF_LIVENESS_H diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp index 6756c1702f76..5680130b91b2 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -277,8 +277,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); @@ -327,6 +325,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i64, Expand); setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); @@ -872,7 +872,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::GlobalTLSAddress: return lowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return lowerJumpTable(Op, DAG); case ISD::SELECT: return lowerSELECT(Op, DAG); - case ISD::SELECT_CC: return lowerSELECT_CC(Op, DAG); case ISD::SETCC: return lowerSETCC(Op, DAG); case ISD::VASTART: return lowerVASTART(Op, DAG); case ISD::VAARG: return lowerVAARG(Op, DAG); @@ -1648,20 +1647,6 @@ lowerSELECT(SDValue Op, SelectionDAG &DAG) const SDLoc(Op)); } -SDValue MipsTargetLowering:: -lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const -{ - SDLoc DL(Op); - EVT Ty = Op.getOperand(0).getValueType(); - SDValue Cond = - DAG.getNode(ISD::SETCC, DL, getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), Ty), - Op.getOperand(0), Op.getOperand(1), Op.getOperand(4)); - - return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2), - Op.getOperand(3)); -} - SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const { assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op); diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h index b33e125b81b7..0dc683e3df27 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h @@ -430,7 +430,6 @@ namespace llvm { SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) 
const; diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td index d9fb8c890739..ffda491f0c86 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -1003,7 +1003,7 @@ class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> { let isCall=1, hasDelaySlot=1, Defs = [RA] in { class JumpLink<string opstr, DAGOperand opnd> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [(MipsJmpLink imm:$target)], II_JAL, FrmJ, opstr> { + [(MipsJmpLink tglobaladdr:$target)], II_JAL, FrmJ, opstr> { let DecoderMethod = "DecodeJumpTarget"; } @@ -2075,8 +2075,6 @@ def : MipsPat<(MipsSync (i32 immz)), (SYNC 0)>, ISA_MIPS2; // Call -def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)), - (JAL tglobaladdr:$dst)>; def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)), (JAL texternalsym:$dst)>; //def : MipsPat<(MipsJmpLink GPR32:$dst), diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index e6f7fe9aae1d..d4aeaf928655 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -544,8 +544,6 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, const MachineOperand &SrcLo = I->getOperand(1), &SrcHi = I->getOperand(2); MachineInstrBuilder LoInst = BuildMI(MBB, I, DL, get(LoOpc)); MachineInstrBuilder HiInst = BuildMI(MBB, I, DL, get(HiOpc)); - LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill())); - HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill())); // Add lo/hi registers if the mtlo/hi instructions created have explicit // def registers. @@ -556,6 +554,9 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, LoInst.addReg(DstLo, RegState::Define); HiInst.addReg(DstHi, RegState::Define); } + + LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill())); + HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill())); } void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB, diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 766369631e14..be735f6c1bce 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -4549,6 +4549,7 @@ NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { delete static_cast<NVPTXSection *>(DwarfLocSection); delete static_cast<NVPTXSection *>(DwarfARangesSection); delete static_cast<NVPTXSection *>(DwarfRangesSection); + delete static_cast<NVPTXSection *>(DwarfMacinfoSection); } MCSection * diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 0f88ddfaa934..683b9a3f49f7 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -41,6 +41,7 @@ public: DwarfLocSection = nullptr; DwarfARangesSection = nullptr; DwarfRangesSection = nullptr; + DwarfMacinfoSection = nullptr; } virtual ~NVPTXTargetObjectFile(); @@ -81,6 +82,8 @@ public: new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); DwarfRangesSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); + DwarfMacinfoSection = + new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); } MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, diff --git 
a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 9a63c14b5053..ec354c209ca0 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1092,8 +1092,28 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { } // ELFv2 ABI - Normal entry label. - if (Subtarget->isELFv2ABI()) + if (Subtarget->isELFv2ABI()) { + // In the Large code model, we allow arbitrary displacements between + // the text section and its associated TOC section. We place the + // full 8-byte offset to the TOC in memory immediatedly preceding + // the function global entry point. + if (TM.getCodeModel() == CodeModel::Large + && !MF->getRegInfo().use_empty(PPC::X2)) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); + + MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + MCSymbol *GlobalEPSymbol = PPCFI->getGlobalEPSymbol(); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), + MCSymbolRefExpr::create(GlobalEPSymbol, + OutContext), + OutContext); + + OutStreamer->EmitLabel(PPCFI->getTOCOffsetSymbol()); + OutStreamer->EmitValue(TOCDeltaExpr, 8); + } return AsmPrinter::EmitFunctionEntryLabel(); + } // Emit an official procedure descriptor. MCSectionSubPair Current = OutStreamer->getCurrentSection(); @@ -1160,10 +1180,25 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { // thus emit a prefix sequence along the following lines: // // func: + // .Lfunc_gepNN: + // # global entry point + // addis r2,r12,(.TOC.-.Lfunc_gepNN)@ha + // addi r2,r2,(.TOC.-.Lfunc_gepNN)@l + // .Lfunc_lepNN: + // .localentry func, .Lfunc_lepNN-.Lfunc_gepNN + // # local entry point, followed by function body + // + // For the Large code model, we create + // + // .Lfunc_tocNN: + // .quad .TOC.-.Lfunc_gepNN # done by EmitFunctionEntryLabel + // func: + // .Lfunc_gepNN: // # global entry point - // addis r2,r12,(.TOC.-func)@ha - // addi r2,r2,(.TOC.-func)@l - // .localentry func, .-func + // ld r2,.Lfunc_tocNN-.Lfunc_gepNN(r12) + // add r2,r2,r12 + // .Lfunc_lepNN: + // .localentry func, .Lfunc_lepNN-.Lfunc_gepNN // # local entry point, followed by function body // // This ensures we have r2 set up correctly while executing the function @@ -1171,32 +1206,49 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { if (Subtarget->isELFv2ABI() // Only do all that if the function uses r2 in the first place. 
&& !MF->getRegInfo().use_empty(PPC::X2)) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); - MCSymbol *GlobalEntryLabel = OutContext.createTempSymbol(); + MCSymbol *GlobalEntryLabel = PPCFI->getGlobalEPSymbol(); OutStreamer->EmitLabel(GlobalEntryLabel); const MCSymbolRefExpr *GlobalEntryLabelExp = MCSymbolRefExpr::create(GlobalEntryLabel, OutContext); - MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); - const MCExpr *TOCDeltaExpr = - MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), - GlobalEntryLabelExp, OutContext); + if (TM.getCodeModel() != CodeModel::Large) { + MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), + GlobalEntryLabelExp, OutContext); - const MCExpr *TOCDeltaHi = - PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) - .addReg(PPC::X2) - .addReg(PPC::X12) - .addExpr(TOCDeltaHi)); - - const MCExpr *TOCDeltaLo = - PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) - .addReg(PPC::X2) - .addReg(PPC::X2) - .addExpr(TOCDeltaLo)); - - MCSymbol *LocalEntryLabel = OutContext.createTempSymbol(); + const MCExpr *TOCDeltaHi = + PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(PPC::X2) + .addReg(PPC::X12) + .addExpr(TOCDeltaHi)); + + const MCExpr *TOCDeltaLo = + PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addExpr(TOCDeltaLo)); + } else { + MCSymbol *TOCOffset = PPCFI->getTOCOffsetSymbol(); + const MCExpr *TOCOffsetDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCOffset, OutContext), + GlobalEntryLabelExp, OutContext); + + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addExpr(TOCOffsetDeltaExpr) + .addReg(PPC::X12)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD8) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addReg(PPC::X12)); + } + + MCSymbol *LocalEntryLabel = PPCFI->getLocalEPSymbol(); OutStreamer->EmitLabel(LocalEntryLabel); const MCSymbolRefExpr *LocalEntryLabelExp = MCSymbolRefExpr::create(LocalEntryLabel, OutContext); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 075e093e41a1..79e4fe379c2d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -299,22 +299,35 @@ def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm), // 64-bit CR instructions let Interpretation64Bit = 1, isCodeGenOnly = 1 in { let hasSideEffects = 0 in { +// mtocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraDefRegAllocReq = 1 in { def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST), "mtocrf $FXM, $ST", IIC_BrMCRX>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that +// is dependent on the cr fields being set. 
def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS), "mtcrf $FXM, $rS", IIC_BrMCRX>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraDefRegAllocReq = 1 -let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +// mfocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraSrcRegAllocReq = 1 in { def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM), "mfocrf $rT, $FXM", IIC_SprMFCRF>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that +// is dependent on the cr fields being copied. def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins), "mfcr $rT", IIC_SprMFCR>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraSrcRegAllocReq = 1 } // hasSideEffects = 0 let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index c17603a7718a..dcff6ad2486f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -744,20 +744,43 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, "isel is for regular integer GPRs only"); unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL; - unsigned SelectPred = Cond[0].getImm(); + auto SelectPred = static_cast<PPC::Predicate>(Cond[0].getImm()); unsigned SubIdx; bool SwapOps; switch (SelectPred) { - default: llvm_unreachable("invalid predicate for isel"); - case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break; - case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break; - case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break; - case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break; - case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break; - case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break; - case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break; - case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break; + case PPC::PRED_EQ: + case PPC::PRED_EQ_MINUS: + case PPC::PRED_EQ_PLUS: + SubIdx = PPC::sub_eq; SwapOps = false; break; + case PPC::PRED_NE: + case PPC::PRED_NE_MINUS: + case PPC::PRED_NE_PLUS: + SubIdx = PPC::sub_eq; SwapOps = true; break; + case PPC::PRED_LT: + case PPC::PRED_LT_MINUS: + case PPC::PRED_LT_PLUS: + SubIdx = PPC::sub_lt; SwapOps = false; break; + case PPC::PRED_GE: + case PPC::PRED_GE_MINUS: + case PPC::PRED_GE_PLUS: + SubIdx = PPC::sub_lt; SwapOps = true; break; + case PPC::PRED_GT: + case PPC::PRED_GT_MINUS: + case PPC::PRED_GT_PLUS: + SubIdx = PPC::sub_gt; SwapOps = false; break; + case PPC::PRED_LE: + case PPC::PRED_LE_MINUS: + case PPC::PRED_LE_PLUS: + SubIdx = PPC::sub_gt; SwapOps = true; break; + case PPC::PRED_UN: + case PPC::PRED_UN_MINUS: + case PPC::PRED_UN_PLUS: + SubIdx = PPC::sub_un; SwapOps = false; break; + case PPC::PRED_NU: + case PPC::PRED_NU_MINUS: + case PPC::PRED_NU_PLUS: + SubIdx = PPC::sub_un; SwapOps = true; break; case PPC::PRED_BIT_SET: SubIdx = 0; SwapOps = false; break; case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 6c4364aad331..ce0f9e6f52a7 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -2299,22 +2299,35 @@ def 
RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F), "#RESTORE_VRSAVE", []>; let hasSideEffects = 0 in { +// mtocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraDefRegAllocReq = 1 in { def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST), "mtocrf $FXM, $ST", IIC_BrMCRX>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that +// is dependent on the cr fields being set. def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS), "mtcrf $FXM, $rS", IIC_BrMCRX>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraDefRegAllocReq = 1 -let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +// mfocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraSrcRegAllocReq = 1 in { def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM), "mfocrf $rT, $FXM", IIC_SprMFCRF>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that +// is dependent on the cr fields being copied. def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins), "mfcr $rT", IIC_SprMFCR>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraSrcRegAllocReq = 1 } // hasSideEffects = 0 // Pseudo instruction to perform FADD in round-to-zero mode. diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp index 95f163153c74..9d91e31165de 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -23,3 +23,24 @@ MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const { Twine(MF.getFunctionNumber()) + "$poff"); } + +MCSymbol *PPCFunctionInfo::getGlobalEPSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_gep" + + Twine(MF.getFunctionNumber())); +} + +MCSymbol *PPCFunctionInfo::getLocalEPSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_lep" + + Twine(MF.getFunctionNumber())); +} + +MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_toc" + + Twine(MF.getFunctionNumber())); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h index 607cdf612eef..10a8ce068d40 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -197,6 +197,10 @@ public: bool usesPICBase() const { return UsesPICBase; } MCSymbol *getPICOffsetSymbol() const; + + MCSymbol *getGlobalEPSymbol() const; + MCSymbol *getLocalEPSymbol() const; + MCSymbol *getTOCOffsetSymbol() const; }; } // end of namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 2dc0d825c80d..a9d2e888f4b7 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -99,6 
+99,11 @@ protected: break; } + // Don't really need to save data to the stack - the clobbered + // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) + // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0); + // Expand into two ops built prior to the existing instruction. MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) .addReg(InReg); @@ -113,6 +118,8 @@ protected: .addReg(GPR3)); Call->addOperand(MI->getOperand(3)); + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); + BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) .addReg(GPR3); diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index 733027a5d2be..05006ac5772b 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -83,7 +83,6 @@ static bool IsIntegerCC(unsigned CC) return (CC <= SPCC::ICC_VC); } - static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC) { switch(CC) { @@ -124,106 +123,103 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC) llvm_unreachable("Invalid cond code"); } -bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const -{ - - MachineBasicBlock::iterator I = MBB.end(); - MachineBasicBlock::iterator UnCondBrIter = MBB.end(); - while (I != MBB.begin()) { - --I; +static bool isUncondBranchOpcode(int Opc) { return Opc == SP::BA; } - if (I->isDebugValue()) - continue; +static bool isCondBranchOpcode(int Opc) { + return Opc == SP::FBCOND || Opc == SP::BCOND; +} - // When we see a non-terminator, we are done. - if (!isUnpredicatedTerminator(I)) - break; +static bool isIndirectBranchOpcode(int Opc) { + return Opc == SP::BINDrr || Opc == SP::BINDri; +} - // Terminator is not a branch. - if (!I->isBranch()) - return true; +static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, + SmallVectorImpl<MachineOperand> &Cond) { + Cond.push_back(MachineOperand::CreateImm(LastInst->getOperand(1).getImm())); + Target = LastInst->getOperand(0).getMBB(); +} - // Handle Unconditional branches. - if (I->getOpcode() == SP::BA) { - UnCondBrIter = I; +bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return false; + + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + unsigned LastOpc = LastInst->getOpcode(); + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); + return false; + } + return true; // Can't handle indirect branch. + } - if (!AllowModify) { - TBB = I->getOperand(0).getMBB(); - continue; + // Get the instruction before it if it is a terminator. 
+ MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If AllowModify is true and the block ends with two or more unconditional + // branches, delete all but the first unconditional branch. + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + // Return now the only terminator is an unconditional branch. + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + SecondLastInst = I; + SecondLastOpc = SecondLastInst->getOpcode(); } + } + } - while (std::next(I) != MBB.end()) - std::next(I)->eraseFromParent(); - - Cond.clear(); - FBB = nullptr; + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; - if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = nullptr; - I->eraseFromParent(); - I = MBB.end(); - UnCondBrIter = MBB.end(); - continue; - } + // If the block ends with a B and a Bcc, handle it. + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + parseCondBranch(SecondLastInst, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } - TBB = I->getOperand(0).getMBB(); - continue; - } + // If the block ends with two unconditional branches, handle it. The second + // one is not executed. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + return false; + } - unsigned Opcode = I->getOpcode(); - if (Opcode != SP::BCOND && Opcode != SP::FBCOND) - return true; // Unknown Opcode. - - SPCC::CondCodes BranchCode = (SPCC::CondCodes)I->getOperand(1).getImm(); - - if (Cond.empty()) { - MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); - if (AllowModify && UnCondBrIter != MBB.end() && - MBB.isLayoutSuccessor(TargetBB)) { - - // Transform the code - // - // brCC L1 - // ba L2 - // L1: - // .. - // L2: - // - // into - // - // brnCC L2 - // L1: - // ... - // L2: - // - BranchCode = GetOppositeBranchCondition(BranchCode); - MachineBasicBlock::iterator OldInst = I; - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(Opcode)) - .addMBB(UnCondBrIter->getOperand(0).getMBB()).addImm(BranchCode); - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(SP::BA)) - .addMBB(TargetBB); - - OldInst->eraseFromParent(); - UnCondBrIter->eraseFromParent(); - - UnCondBrIter = MBB.end(); - I = MBB.end(); - continue; - } - FBB = TBB; - TBB = I->getOperand(0).getMBB(); - Cond.push_back(MachineOperand::CreateImm(BranchCode)); - continue; - } - // FIXME: Handle subsequent conditional branches. - // For now, we can't handle multiple conditional branches. + // ...likewise if it ends with an indirect branch followed by an unconditional + // branch. + if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); return true; } - return false; + + // Otherwise, can't handle this. 
+ return true; } unsigned @@ -277,6 +273,14 @@ unsigned SparcInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const return Count; } +bool SparcInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 1); + SPCC::CondCodes CC = static_cast<SPCC::CondCodes>(Cond[0].getImm()); + Cond[0].setImm(GetOppositeBranchCondition(CC)); + return false; +} + void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h index 15673f134d80..9de624cc9582 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -76,6 +76,9 @@ public: MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index b9f2eb5514a5..d5dabc2cd6ab 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -1219,6 +1219,9 @@ def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>; // Atomic operations //===----------------------------------------------------------------------===// +// A serialization instruction that acts as a barrier for all memory +// accesses, which expands to "bcr 14, 0". +let hasSideEffects = 1 in def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>; let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt b/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt new file mode 100644 index 000000000000..5e55e2958aeb --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMWebAssemblyDisassembler + WebAssemblyDisassembler.cpp + ) diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt b/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt new file mode 100644 index 000000000000..a452ca1acd04 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===-- ./lib/Target/WebAssembly/Disassembler/LLVMBuild.txt -----*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = WebAssemblyDisassembler +parent = WebAssembly +required_libraries = MCDisassembler WebAssemblyInfo Support +add_to_library_groups = WebAssembly diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile b/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile new file mode 100644 index 000000000000..bcd36ba6f01f --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===-- lib/Target/WebAssembly/Disassembler/Makefile -------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMWebAssemblyDisassembler + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp new file mode 100644 index 000000000000..0143b10c0ab1 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -0,0 +1,148 @@ +//==- WebAssemblyDisassembler.cpp - Disassembler for WebAssembly -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file is part of the WebAssembly Disassembler. +/// +/// It contains code to translate the data produced by the decoder into +/// MCInsts. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-disassembler" + +namespace { +class WebAssemblyDisassembler final : public MCDisassembler { + std::unique_ptr<const MCInstrInfo> MCII; + + DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &VStream, + raw_ostream &CStream) const override; + +public: + WebAssemblyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + std::unique_ptr<const MCInstrInfo> MCII) + : MCDisassembler(STI, Ctx), MCII(std::move(MCII)) {} +}; +} // end anonymous namespace + +static MCDisassembler *createWebAssemblyDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + std::unique_ptr<const MCInstrInfo> MCII(T.createMCInstrInfo()); + return new WebAssemblyDisassembler(STI, Ctx, std::move(MCII)); +} + +extern "C" void LLVMInitializeWebAssemblyDisassembler() { + // Register the disassembler for each target. 
+ TargetRegistry::RegisterMCDisassembler(TheWebAssemblyTarget32, + createWebAssemblyDisassembler); + TargetRegistry::RegisterMCDisassembler(TheWebAssemblyTarget64, + createWebAssemblyDisassembler); +} + +MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( + MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/, + raw_ostream &OS, raw_ostream &CS) const { + Size = 0; + uint64_t Pos = 0; + + // Read the opcode. + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Opcode = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + + if (Opcode >= WebAssembly::INSTRUCTION_LIST_END) + return MCDisassembler::Fail; + + MI.setOpcode(Opcode); + const MCInstrDesc &Desc = MCII->get(Opcode); + unsigned NumFixedOperands = Desc.NumOperands; + + // If it's variadic, read the number of extra operands. + unsigned NumExtraOperands = 0; + if (Desc.isVariadic()) { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + NumExtraOperands = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + } + + // Read the fixed operands. These are described by the MCInstrDesc. + for (unsigned i = 0; i < NumFixedOperands; ++i) { + const MCOperandInfo &Info = Desc.OpInfo[i]; + switch (Info.OperandType) { + case MCOI::OPERAND_IMMEDIATE: + case WebAssembly::OPERAND_BASIC_BLOCK: { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Imm = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + MI.addOperand(MCOperand::createImm(Imm)); + break; + } + case MCOI::OPERAND_REGISTER: { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Reg = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + MI.addOperand(MCOperand::createReg(Reg)); + break; + } + case WebAssembly::OPERAND_FPIMM: { + // TODO: MC converts all floating point immediate operands to double. + // This is fine for numeric values, but may cause NaNs to change bits. + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Bits = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + double Imm; + memcpy(&Imm, &Bits, sizeof(Imm)); + MI.addOperand(MCOperand::createFPImm(Imm)); + break; + } + default: + llvm_unreachable("unimplemented operand kind"); + } + } + + // Read the extra operands. + assert(NumExtraOperands == 0 || Desc.isVariadic()); + for (unsigned i = 0; i < NumExtraOperands; ++i) { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + if (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate) { + // Decode extra immediate operands. + uint64_t Imm = support::endian::read64le(Bytes.data() + Pos); + MI.addOperand(MCOperand::createImm(Imm)); + } else { + // Decode extra register operands. 
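For reference, the interim binary format this disassembler consumes (and which the MCCodeEmitter change further down produces) is a flat sequence of little-endian 64-bit words: the opcode, then an extra-operand count for variadic instructions, then one word per operand, with FP immediates carried as the bit pattern of a double. A stand-alone reader for that layout over a made-up buffer, independent of the MC classes:

#include <cstdint>
#include <cstdio>
#include <vector>

// Read one little-endian 64-bit word, as the disassembler above does with
// support::endian::read64le.
static bool readWord(const std::vector<uint8_t> &Bytes, size_t &Pos,
                     uint64_t &Out) {
  if (Pos + sizeof(uint64_t) > Bytes.size())
    return false;
  Out = 0;
  for (unsigned i = 0; i != 8; ++i)
    Out |= uint64_t(Bytes[Pos + i]) << (8 * i);
  Pos += sizeof(uint64_t);
  return true;
}

int main() {
  // Hypothetical record: opcode 7 with two immediate operands, 42 and 5.
  std::vector<uint8_t> Buf;
  for (uint64_t W : {uint64_t(7), uint64_t(42), uint64_t(5)})
    for (unsigned i = 0; i != 8; ++i)
      Buf.push_back(uint8_t(W >> (8 * i)));

  size_t Pos = 0;
  uint64_t Opcode, Op0, Op1;
  if (readWord(Buf, Pos, Opcode) && readWord(Buf, Pos, Op0) &&
      readWord(Buf, Pos, Op1))
    std::printf("opcode %llu, operands %llu and %llu\n",
                (unsigned long long)Opcode, (unsigned long long)Op0,
                (unsigned long long)Op1);
  return 0;
}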
+ uint64_t Reg = support::endian::read64le(Bytes.data() + Pos); + MI.addOperand(MCOperand::createReg(Reg)); + } + Pos += sizeof(uint64_t); + } + + Size = Pos; + return MCDisassembler::Success; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp index 7ce3a00ae360..9a95150cb557 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp @@ -16,6 +16,8 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" #include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -33,7 +35,7 @@ using namespace llvm; WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} + : MCInstPrinter(MAI, MII, MRI), ControlFlowCounter(0) {} void WebAssemblyInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { @@ -59,6 +61,52 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, // Print any added annotation. printAnnotation(OS, Annot); + + if (CommentStream) { + // Observe any effects on the control flow stack, for use in annotating + // control flow label references. + switch (MI->getOpcode()) { + default: + break; + case WebAssembly::LOOP: { + // Grab the TopLabel value first so that labels print in numeric order. + uint64_t TopLabel = ControlFlowCounter++; + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + printAnnotation(OS, "label" + utostr(TopLabel) + ':'); + ControlFlowStack.push_back(std::make_pair(TopLabel, true)); + break; + } + case WebAssembly::BLOCK: + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + break; + case WebAssembly::END_LOOP: + ControlFlowStack.pop_back(); + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + break; + case WebAssembly::END_BLOCK: + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + break; + } + + // Annotate any control flow label references. + unsigned NumFixedOperands = Desc.NumOperands; + SmallSet<uint64_t, 8> Printed; + for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { + const MCOperandInfo &Info = Desc.OpInfo[i]; + if (!(i < NumFixedOperands + ? (Info.OperandType == WebAssembly::OPERAND_BASIC_BLOCK) + : (Desc.TSFlags & WebAssemblyII::VariableOpImmediateIsLabel))) + continue; + uint64_t Depth = MI->getOperand(i).getImm(); + if (!Printed.insert(Depth).second) + continue; + const auto &Pair = ControlFlowStack.rbegin()[Depth]; + printAnnotation(OS, utostr(Depth) + ": " + (Pair.second ? 
"up" : "down") + + " to label" + utostr(Pair.first)); + } + } } static std::string toString(const APFloat &FP) { @@ -82,6 +130,9 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + MII.get(MI->getOpcode()).TSFlags == 0) && + "WebAssembly variable_ops register ops don't use TSFlags"); unsigned WAReg = Op.getReg(); if (int(WAReg) >= 0) printRegName(O, WAReg); @@ -95,19 +146,27 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (OpNo < MII.get(MI->getOpcode()).getNumDefs()) O << '='; } else if (Op.isImm()) { - switch (MI->getOpcode()) { - case WebAssembly::PARAM: - case WebAssembly::RESULT: - case WebAssembly::LOCAL: - O << WebAssembly::TypeToString(MVT::SimpleValueType(Op.getImm())); - break; - default: - O << Op.getImm(); - break; - } - } else if (Op.isFPImm()) + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + (MII.get(MI->getOpcode()).TSFlags & + WebAssemblyII::VariableOpIsImmediate)) && + "WebAssemblyII::VariableOpIsImmediate should be set for " + "variable_ops immediate ops"); + // TODO: (MII.get(MI->getOpcode()).TSFlags & + // WebAssemblyII::VariableOpImmediateIsLabel) + // can tell us whether this is an immediate referencing a label in the + // control flow stack, and it may be nice to pretty-print. + O << Op.getImm(); + } else if (Op.isFPImm()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + MII.get(MI->getOpcode()).TSFlags == 0) && + "WebAssembly variable_ops floating point ops don't use TSFlags"); O << toString(APFloat(Op.getFPImm())); - else { + } else { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + (MII.get(MI->getOpcode()).TSFlags & + WebAssemblyII::VariableOpIsImmediate)) && + "WebAssemblyII::VariableOpIsImmediate should be set for " + "variable_ops expr ops"); assert(Op.isExpr() && "unknown operand kind in printOperand"); Op.getExpr()->print(O, &MAI); } diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h index 39a16f59fd78..cd6c59a41c33 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h @@ -23,6 +23,9 @@ namespace llvm { class MCSubtargetInfo; class WebAssemblyInstPrinter final : public MCInstPrinter { + uint64_t ControlFlowCounter; + SmallVector<std::pair<uint64_t, bool>, 0> ControlFlowStack; + public: WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI); diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index b158ccb46f99..bba06f65e169 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -95,9 +95,6 @@ WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { } } // end anonymous namespace -MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - const Triple &TT, - StringRef CPU) { +MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) { return new WebAssemblyAsmBackend(TT.isArch64Bit()); } diff --git 
a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp index c47a3d9094e5..2bb58b33934e 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp @@ -30,19 +30,31 @@ protected: }; } // end anonymous namespace -// FIXME: Use EM_NONE as a temporary hack. Should we decide to pursue ELF -// writing seriously, we should email generic-abi@googlegroups.com and ask -// for our own ELF code. WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI) - : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_NONE, - /*HasRelocationAddend=*/true) {} + : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY, + /*HasRelocationAddend=*/false) {} unsigned WebAssemblyELFObjectWriter::GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - // FIXME: Do we need our own relocs? - return Fixup.getKind(); + // WebAssembly functions are not allocated in the address space. To resolve a + // pointer to a function, we must use a special relocation type. + if (const MCSymbolRefExpr *SyExp = + dyn_cast<MCSymbolRefExpr>(Fixup.getValue())) + if (SyExp->getKind() == MCSymbolRefExpr::VK_WebAssembly_FUNCTION) + return ELF::R_WEBASSEMBLY_FUNCTION; + + switch (Fixup.getKind()) { + case FK_Data_4: + assert(!is64Bit() && "4-byte relocations only supported on wasm32"); + return ELF::R_WEBASSEMBLY_DATA; + case FK_Data_8: + assert(is64Bit() && "8-byte relocations only supported on wasm64"); + return ELF::R_WEBASSEMBLY_DATA; + default: + llvm_unreachable("unimplemented fixup kind"); + } } MCObjectWriter *llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index d2617796ca99..02c717a92101 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -27,11 +27,12 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { // TODO: What should MaxInstLength be? - PrivateGlobalPrefix = ""; - PrivateLabelPrefix = ""; - UseDataRegionDirectives = true; + // Use .skip instead of .zero because .zero is confusing when used with two + // arguments (it doesn't actually zero things out). 
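The new GetRelocType above decides between exactly two relocation kinds: references to function symbols get the function-table relocation, and plain data fixups get the data relocation, with the fixup width asserted to match the wasm32/wasm64 pointer size. A condensed sketch of that decision, using stand-in enumerators rather than the real ELF::R_WEBASSEMBLY_* constants:

#include <cassert>

// Stand-ins for ELF::R_WEBASSEMBLY_FUNCTION / ELF::R_WEBASSEMBLY_DATA.
enum WasmReloc { RelocFunction, RelocData };

// IsFunctionRef models "the fixup's symbol expression carries the
// VK_WebAssembly_FUNCTION modifier"; FixupBytes is 4 for FK_Data_4 and
// 8 for FK_Data_8.
static WasmReloc selectReloc(bool IsFunctionRef, unsigned FixupBytes,
                             bool Is64Bit) {
  if (IsFunctionRef)
    return RelocFunction;
  if (FixupBytes == 4) {
    assert(!Is64Bit && "4-byte relocations only supported on wasm32");
    return RelocData;
  }
  assert(FixupBytes == 8 && Is64Bit &&
         "8-byte relocations only supported on wasm64");
  return RelocData;
}

int main() {
  assert(selectReloc(true, 4, false) == RelocFunction);
  assert(selectReloc(false, 8, true) == RelocData);
  return 0;
}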
+ ZeroDirective = "\t.skip\t"; + Data8bitsDirective = "\t.int8\t"; Data16bitsDirective = "\t.int16\t"; Data32bitsDirective = "\t.int32\t"; diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index 7c6c79eb5db2..f409bd77442c 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCFixup.h" @@ -26,75 +27,66 @@ using namespace llvm; #define DEBUG_TYPE "mccodeemitter" +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumFixups, "Number of MC fixups created."); + namespace { class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { - const MCRegisterInfo &MRI; - -public: - WebAssemblyMCCodeEmitter(const MCInstrInfo &, const MCRegisterInfo &mri, - MCContext &) - : MRI(mri) {} + const MCInstrInfo &MCII; + const MCContext &Ctx; - ~WebAssemblyMCCodeEmitter() override {} - - /// TableGen'erated function for getting the binary encoding for an - /// instruction. + // Implementation generated by tablegen. uint64_t getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - /// Return binary encoding of operand. If the machine operand requires - /// relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - - uint64_t getMemoryOpValue(const MCInst &MI, unsigned Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; + +public: + WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), Ctx(ctx) {} }; } // end anonymous namespace MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { - return new WebAssemblyMCCodeEmitter(MCII, MRI, Ctx); -} - -unsigned WebAssemblyMCCodeEmitter::getMachineOpValue( - const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return MRI.getEncodingValue(MO.getReg()); - if (MO.isImm()) - return static_cast<unsigned>(MO.getImm()); - - assert(MO.isExpr()); - - assert(MO.getExpr()->getKind() == MCExpr::SymbolRef); - - assert(false && "FIXME: not implemented yet"); - - return 0; + return new WebAssemblyMCCodeEmitter(MCII, Ctx); } void WebAssemblyMCCodeEmitter::encodeInstruction( const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - assert(false && "FIXME: not implemented yet"); -} - -// Encode WebAssembly Memory Operand -uint64_t -WebAssemblyMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - assert(false && "FIXME: not implemented yet"); - return 0; + // FIXME: This is not the real binary encoding. This is an extremely + // over-simplified encoding where we just use uint64_t for everything. 
This + // is a temporary measure. + support::endian::Writer<support::little>(OS).write<uint64_t>(MI.getOpcode()); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (Desc.isVariadic()) + support::endian::Writer<support::little>(OS).write<uint64_t>( + MI.getNumOperands() - Desc.NumOperands); + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(MO.getReg()); + } else if (MO.isImm()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(MO.getImm()); + } else if (MO.isFPImm()) { + support::endian::Writer<support::little>(OS).write<double>(MO.getFPImm()); + } else if (MO.isExpr()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(0); + Fixups.push_back(MCFixup::create( + (1 + MCII.get(MI.getOpcode()).isVariadic() + i) * sizeof(uint64_t), + MO.getExpr(), STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4, + MI.getLoc())); + ++MCNumFixups; + } else { + llvm_unreachable("unexpected operand kind"); + } + } + + ++MCNumEmitted; // Keep track of the # of mi's emitted. } #include "WebAssemblyGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 14cd295353d5..37000f1cd571 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -15,10 +15,10 @@ #include "WebAssemblyMCTargetDesc.h" #include "InstPrinter/WebAssemblyInstPrinter.h" #include "WebAssemblyMCAsmInfo.h" +#include "WebAssemblyTargetStreamer.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" @@ -35,52 +35,89 @@ using namespace llvm; #define GET_REGINFO_MC_DESC #include "WebAssemblyGenRegisterInfo.inc" -static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo & /*MRI*/, - const Triple &TT) { +static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/, + const Triple &TT) { return new WebAssemblyMCAsmInfo(TT); } -static MCInstrInfo *createWebAssemblyMCInstrInfo() { +static MCInstrInfo *createMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitWebAssemblyMCInstrInfo(X); return X; } -static MCStreamer *createWebAssemblyMCStreamer(const Triple &T, MCContext &Ctx, - MCAsmBackend &MAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, - bool RelaxAll) { - return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll); +static MCRegisterInfo *createMCRegisterInfo(const Triple & /*T*/) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitWebAssemblyMCRegisterInfo(X, 0); + return X; } -static MCInstPrinter * -createWebAssemblyMCInstPrinter(const Triple & /*T*/, unsigned SyntaxVariant, - const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) { +static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { assert(SyntaxVariant == 0); return new WebAssemblyInstPrinter(MAI, MII, MRI); } +static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo & /*MRI*/, + MCContext &Ctx) { + return createWebAssemblyMCCodeEmitter(MCII, Ctx); +} + +static MCAsmBackend 
*createAsmBackend(const Target & /*T*/, + const MCRegisterInfo & /*MRI*/, + const Triple &TT, StringRef /*CPU*/) { + return createWebAssemblyAsmBackend(TT); +} + +static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU, + StringRef FS) { + return createWebAssemblyMCSubtargetInfoImpl(TT, CPU, FS); +} + +static MCTargetStreamer * +createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo & /*STI*/) { + return new WebAssemblyTargetELFStreamer(S); +} + +static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter * /*InstPrint*/, + bool /*isVerboseAsm*/) { + return new WebAssemblyTargetAsmStreamer(S, OS); +} + // Force static initialization. extern "C" void LLVMInitializeWebAssemblyTargetMC() { for (Target *T : {&TheWebAssemblyTarget32, &TheWebAssemblyTarget64}) { // Register the MC asm info. - RegisterMCAsmInfoFn X(*T, createWebAssemblyMCAsmInfo); + RegisterMCAsmInfoFn X(*T, createMCAsmInfo); // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(*T, createWebAssemblyMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(*T, createMCInstrInfo); - // Register the object streamer - TargetRegistry::RegisterELFStreamer(*T, createWebAssemblyMCStreamer); + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createMCRegisterInfo); // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(*T, createWebAssemblyMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(*T, createMCInstPrinter); + + // Register the MC code emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createCodeEmitter); + + // Register the ASM Backend. + TargetRegistry::RegisterMCAsmBackend(*T, createAsmBackend); - // Register the MC code emitter - TargetRegistry::RegisterMCCodeEmitter(*T, createWebAssemblyMCCodeEmitter); + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createMCSubtargetInfo); - // Register the ASM Backend - TargetRegistry::RegisterMCAsmBackend(*T, createWebAssemblyAsmBackend); + // Register the object target streamer. + TargetRegistry::RegisterObjectTargetStreamer(*T, + createObjectTargetStreamer); + // Register the asm target streamer. 
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer); } } diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index e78f73e3da95..9bac4f82822a 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -15,40 +15,62 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H +#include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/DataTypes.h" namespace llvm { -class formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; -class MCRegisterInfo; class MCObjectWriter; -class MCStreamer; class MCSubtargetInfo; -class MCTargetStreamer; -class StringRef; class Target; class Triple; -class raw_ostream; class raw_pwrite_stream; extern Target TheWebAssemblyTarget32; extern Target TheWebAssemblyTarget64; MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU); +MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, uint8_t OSABI); +namespace WebAssembly { +enum OperandType { + /// Basic block label in a branch construct. + OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET, + /// Floating-point immediate. + OPERAND_FPIMM +}; + +/// WebAssembly-specific directive identifiers. +enum Directive { + // FIXME: This is not the real binary encoding. + DotParam = UINT64_MAX - 0, ///< .param + DotResult = UINT64_MAX - 1, ///< .result + DotLocal = UINT64_MAX - 2, ///< .local + DotEndFunc = UINT64_MAX - 3, ///< .endfunc +}; + +} // end namespace WebAssembly + +namespace WebAssemblyII { +enum { + // For variadic instructions, this flag indicates whether an operand + // in the variable_ops range is an immediate value. + VariableOpIsImmediate = (1 << 0), + // For immediate values in the variable_ops range, this flag indicates + // whether the value represents a control-flow label. + VariableOpImmediateIsLabel = (1 << 1), +}; +} // end namespace WebAssemblyII + } // end namespace llvm // Defines symbolic names for WebAssembly registers. This defines a mapping from diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp new file mode 100644 index 000000000000..1d2822869a15 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -0,0 +1,94 @@ +//==-- WebAssemblyTargetStreamer.cpp - WebAssembly Target Streamer Methods --=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines WebAssembly-specific target streamer classes. +/// These are for implementing support for target-specific assembly directives. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyTargetStreamer.h" +#include "InstPrinter/WebAssemblyInstPrinter.h" +#include "WebAssemblyMCTargetDesc.h" +#include "WebAssemblyTargetObjectFile.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +using namespace llvm; + +WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S) + : MCTargetStreamer(S) {} + +WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer( + MCStreamer &S, formatted_raw_ostream &OS) + : WebAssemblyTargetStreamer(S), OS(OS) {} + +WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S) + : WebAssemblyTargetStreamer(S) {} + +static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) { + bool First = true; + for (MVT Type : Types) { + if (First) + First = false; + else + OS << ", "; + OS << WebAssembly::TypeToString(Type); + } + OS << '\n'; +} + +void WebAssemblyTargetAsmStreamer::emitParam(ArrayRef<MVT> Types) { + OS << "\t.param \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitResult(ArrayRef<MVT> Types) { + OS << "\t.result \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) { + OS << "\t.local \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; } + +// FIXME: What follows is not the real binary encoding. + +static void EncodeTypes(MCStreamer &Streamer, ArrayRef<MVT> Types) { + Streamer.EmitIntValue(Types.size(), sizeof(uint64_t)); + for (MVT Type : Types) + Streamer.EmitIntValue(Type.SimpleTy, sizeof(uint64_t)); +} + +void WebAssemblyTargetELFStreamer::emitParam(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotParam, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitResult(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotResult, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotLocal, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitEndFunc() { + Streamer.EmitIntValue(WebAssembly::DotEndFunc, sizeof(uint64_t)); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h new file mode 100644 index 000000000000..c66a51574efb --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -0,0 +1,68 @@ +//==-- WebAssemblyTargetStreamer.h - WebAssembly Target Streamer -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares WebAssembly-specific target streamer classes. +/// These are for implementing support for target-specific assembly directives. 
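Both streamer flavors above emit the same logical record for a directive such as .param: the assembly streamer prints the types as text, while the ELF streamer writes the interim binary form used elsewhere in this patch: a 64-bit directive marker, a 64-bit count, then one 64-bit word per type. A small stand-alone writer for that binary layout; the marker and type codes below are illustrative only (the real values are WebAssembly::DotParam and MVT::SimpleValueType):

#include <cstdint>
#include <vector>

static void writeWord(std::vector<uint8_t> &Out, uint64_t W) {
  for (unsigned i = 0; i != 8; ++i)
    Out.push_back(uint8_t(W >> (8 * i)));  // little-endian, one byte at a time
}

// Mirrors WebAssemblyTargetELFStreamer::emitParam: marker, count, then types.
static void emitParamRecord(std::vector<uint8_t> &Out,
                            const std::vector<uint64_t> &TypeCodes) {
  const uint64_t DotParamMarker = ~uint64_t(0);  // stand-in for DotParam
  writeWord(Out, DotParamMarker);
  writeWord(Out, TypeCodes.size());
  for (uint64_t T : TypeCodes)
    writeWord(Out, T);
}

int main() {
  std::vector<uint8_t> Out;
  emitParamRecord(Out, {1, 3});  // e.g. ".param i32, f64" with made-up codes
  return Out.size() == 4 * sizeof(uint64_t) ? 0 : 1;
}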
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H +#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H + +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +class MCELFStreamer; + +/// WebAssembly-specific streamer interface, to implement support +/// WebAssembly-specific assembly directives. +class WebAssemblyTargetStreamer : public MCTargetStreamer { +public: + explicit WebAssemblyTargetStreamer(MCStreamer &S); + + /// .param + virtual void emitParam(ArrayRef<MVT> Types) = 0; + /// .result + virtual void emitResult(ArrayRef<MVT> Types) = 0; + /// .local + virtual void emitLocal(ArrayRef<MVT> Types) = 0; + /// .endfunc + virtual void emitEndFunc() = 0; +}; + +/// This part is for ascii assembly output +class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer { + formatted_raw_ostream &OS; + +public: + WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + + void emitParam(ArrayRef<MVT> Types) override; + void emitResult(ArrayRef<MVT> Types) override; + void emitLocal(ArrayRef<MVT> Types) override; + void emitEndFunc() override; +}; + +/// This part is for ELF object output +class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer { +public: + explicit WebAssemblyTargetELFStreamer(MCStreamer &S); + + void emitParam(ArrayRef<MVT> Types) override; + void emitResult(ArrayRef<MVT> Types) override; + void emitLocal(ArrayRef<MVT> Types) override; + void emitEndFunc() override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 0d2b4d9debb9..45ac99d90ed9 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -17,6 +17,7 @@ #include "WebAssembly.h" #include "InstPrinter/WebAssemblyInstPrinter.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "MCTargetDesc/WebAssemblyTargetStreamer.h" #include "WebAssemblyMCInstLower.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblyRegisterInfo.h" @@ -69,7 +70,9 @@ private: void EmitJumpTableInfo() override; void EmitConstantPool() override; void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; void EmitInstruction(const MachineInstr *MI) override; + const MCExpr *lowerConstant(const Constant *CV) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) override; @@ -80,6 +83,7 @@ private: MVT getRegType(unsigned RegNo) const; const char *toString(MVT VT) const; std::string regToString(const MachineOperand &MO); + WebAssemblyTargetStreamer *getTargetStreamer(); }; } // end anonymous namespace @@ -90,9 +94,9 @@ private: MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { const TargetRegisterClass *TRC = - TargetRegisterInfo::isVirtualRegister(RegNo) ? - MRI->getRegClass(RegNo) : - MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo); + TargetRegisterInfo::isVirtualRegister(RegNo) + ? 
MRI->getRegClass(RegNo) + : MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo); for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64}) if (TRC->hasType(T)) return T; @@ -101,6 +105,10 @@ MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { return MVT::Other; } +const char *WebAssemblyAsmPrinter::toString(MVT VT) const { + return WebAssembly::TypeToString(VT); +} + std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { unsigned RegNo = MO.getReg(); assert(TargetRegisterInfo::isVirtualRegister(RegNo) && @@ -111,8 +119,10 @@ std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { return '$' + utostr(WAReg); } -const char *WebAssemblyAsmPrinter::toString(MVT VT) const { - return WebAssembly::TypeToString(VT); +WebAssemblyTargetStreamer * +WebAssemblyAsmPrinter::getTargetStreamer() { + MCTargetStreamer *TS = OutStreamer->getTargetStreamer(); + return static_cast<WebAssemblyTargetStreamer *>(TS); } //===----------------------------------------------------------------------===// @@ -145,29 +155,20 @@ static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM, } void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { - if (!MFI->getParams().empty()) { - MCInst Param; - Param.setOpcode(WebAssembly::PARAM); - for (MVT VT : MFI->getParams()) - Param.addOperand(MCOperand::createImm(VT.SimpleTy)); - EmitToStreamer(*OutStreamer, Param); - } + if (!MFI->getParams().empty()) + getTargetStreamer()->emitParam(MFI->getParams()); SmallVector<MVT, 4> ResultVTs; const Function &F(*MF->getFunction()); ComputeLegalValueVTs(F, TM, F.getReturnType(), ResultVTs); + // If the return type needs to be legalized it will get converted into // passing a pointer. - if (ResultVTs.size() == 1) { - MCInst Result; - Result.setOpcode(WebAssembly::RESULT); - Result.addOperand(MCOperand::createImm(ResultVTs.front().SimpleTy)); - EmitToStreamer(*OutStreamer, Result); - } + if (ResultVTs.size() == 1) + getTargetStreamer()->emitResult(ResultVTs); bool AnyWARegs = false; - MCInst Local; - Local.setOpcode(WebAssembly::LOCAL); + SmallVector<MVT, 16> LocalTypes; for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx); unsigned WAReg = MFI->getWAReg(VReg); @@ -180,22 +181,26 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { // Don't declare stackified registers. if (int(WAReg) < 0) continue; - Local.addOperand(MCOperand::createImm(getRegType(VReg).SimpleTy)); + LocalTypes.push_back(getRegType(VReg)); AnyWARegs = true; } auto &PhysRegs = MFI->getPhysRegs(); for (unsigned PReg = 0; PReg < PhysRegs.size(); ++PReg) { if (PhysRegs[PReg] == -1U) continue; - Local.addOperand(MCOperand::createImm(getRegType(PReg).SimpleTy)); + LocalTypes.push_back(getRegType(PReg)); AnyWARegs = true; } if (AnyWARegs) - EmitToStreamer(*OutStreamer, Local); + getTargetStreamer()->emitLocal(LocalTypes); AsmPrinter::EmitFunctionBodyStart(); } +void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() { + getTargetStreamer()->emitEndFunc(); +} + void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n'); @@ -207,10 +212,6 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { // These represent values which are live into the function entry, so there's // no instruction to emit. 
break; - case WebAssembly::LOOP_END: - // This is a no-op which just exists to tell AsmPrinter.cpp that there's a - // fallthrough which nevertheless requires a label for the destination here. - break; default: { WebAssemblyMCInstLower MCInstLowering(OutContext, *this); MCInst TmpInst; @@ -221,6 +222,14 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } +const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) { + if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) + if (GV->getValueType()->isFunctionTy()) + return MCSymbolRefExpr::create( + getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext); + return AsmPrinter::lowerConstant(CV); +} + bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index e9671ee07e69..a39349c562fd 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -34,6 +34,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -256,7 +257,8 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { /// code) for a branch instruction to both branch to a block and fallthrough /// to it, so we check the actual branch operands to see if there are any /// explicit mentions. -static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, MachineBasicBlock *MBB) { +static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, + MachineBasicBlock *MBB) { for (MachineInstr &MI : Pred->terminators()) for (MachineOperand &MO : MI.explicit_operands()) if (MO.isMBB() && MO.getMBB() == MBB) @@ -325,13 +327,21 @@ static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF, InsertPos = Header->getFirstTerminator(); while (InsertPos != Header->begin() && prev(InsertPos)->definesRegister(WebAssembly::EXPR_STACK) && - prev(InsertPos)->getOpcode() != WebAssembly::LOOP) + prev(InsertPos)->getOpcode() != WebAssembly::LOOP && + prev(InsertPos)->getOpcode() != WebAssembly::END_BLOCK && + prev(InsertPos)->getOpcode() != WebAssembly::END_LOOP) --InsertPos; } // Add the BLOCK. - BuildMI(*Header, InsertPos, DebugLoc(), TII.get(WebAssembly::BLOCK)) - .addMBB(&MBB); + BuildMI(*Header, InsertPos, DebugLoc(), TII.get(WebAssembly::BLOCK)); + + // Mark the end of the block. + InsertPos = MBB.begin(); + while (InsertPos != MBB.end() && + InsertPos->getOpcode() == WebAssembly::END_LOOP) + ++InsertPos; + BuildMI(MBB, InsertPos, DebugLoc(), TII.get(WebAssembly::END_BLOCK)); // Track the farthest-spanning scope that ends at this point. int Number = MBB.getNumber(); @@ -341,10 +351,11 @@ static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF, } /// Insert a LOOP marker for a loop starting at MBB (if it's a loop header). 
-static void PlaceLoopMarker(MachineBasicBlock &MBB, MachineFunction &MF, - SmallVectorImpl<MachineBasicBlock *> &ScopeTops, - const WebAssemblyInstrInfo &TII, - const MachineLoopInfo &MLI) { +static void PlaceLoopMarker( + MachineBasicBlock &MBB, MachineFunction &MF, + SmallVectorImpl<MachineBasicBlock *> &ScopeTops, + DenseMap<const MachineInstr *, const MachineBasicBlock *> &LoopTops, + const WebAssemblyInstrInfo &TII, const MachineLoopInfo &MLI) { MachineLoop *Loop = MLI.getLoopFor(&MBB); if (!Loop || Loop->getHeader() != &MBB) return; @@ -361,14 +372,19 @@ static void PlaceLoopMarker(MachineBasicBlock &MBB, MachineFunction &MF, Iter = next(MachineFunction::iterator(Bottom)); } MachineBasicBlock *AfterLoop = &*Iter; - BuildMI(MBB, MBB.begin(), DebugLoc(), TII.get(WebAssembly::LOOP)) - .addMBB(AfterLoop); - // Emit a special no-op telling the asm printer that we need a label to close - // the loop scope, even though the destination is only reachable by - // fallthrough. - if (!Bottom->back().isBarrier()) - BuildMI(*Bottom, Bottom->end(), DebugLoc(), TII.get(WebAssembly::LOOP_END)); + // Mark the beginning of the loop (after the end of any existing loop that + // ends here). + auto InsertPos = MBB.begin(); + while (InsertPos != MBB.end() && + InsertPos->getOpcode() == WebAssembly::END_LOOP) + ++InsertPos; + BuildMI(MBB, InsertPos, DebugLoc(), TII.get(WebAssembly::LOOP)); + + // Mark the end of the loop. + MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), DebugLoc(), + TII.get(WebAssembly::END_LOOP)); + LoopTops[End] = &MBB; assert((!ScopeTops[AfterLoop->getNumber()] || ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) && @@ -377,6 +393,19 @@ static void PlaceLoopMarker(MachineBasicBlock &MBB, MachineFunction &MF, ScopeTops[AfterLoop->getNumber()] = &MBB; } +static unsigned +GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack, + const MachineBasicBlock *MBB) { + unsigned Depth = 0; + for (auto X : reverse(Stack)) { + if (X == MBB) + break; + ++Depth; + } + assert(Depth < Stack.size() && "Branch destination should be in scope"); + return Depth; +} + /// Insert LOOP and BLOCK markers at appropriate places. static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI, const WebAssemblyInstrInfo &TII, @@ -388,25 +417,57 @@ static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI, // we may insert at the end. SmallVector<MachineBasicBlock *, 8> ScopeTops(MF.getNumBlockIDs() + 1); + // For eacn LOOP_END, the corresponding LOOP. + DenseMap<const MachineInstr *, const MachineBasicBlock *> LoopTops; + for (auto &MBB : MF) { // Place the LOOP for MBB if MBB is the header of a loop. - PlaceLoopMarker(MBB, MF, ScopeTops, TII, MLI); + PlaceLoopMarker(MBB, MF, ScopeTops, LoopTops, TII, MLI); // Place the BLOCK for MBB if MBB is branched to from above. PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT); } -} -#ifndef NDEBUG -static bool -IsOnStack(const SmallVectorImpl<std::pair<MachineBasicBlock *, bool>> &Stack, - const MachineBasicBlock *MBB) { - for (const auto &Pair : Stack) - if (Pair.first == MBB) - return true; - return false; + // Now rewrite references to basic blocks to be depth immediates. 
+ SmallVector<const MachineBasicBlock *, 8> Stack; + for (auto &MBB : reverse(MF)) { + for (auto &MI : reverse(MBB)) { + switch (MI.getOpcode()) { + case WebAssembly::BLOCK: + assert(ScopeTops[Stack.back()->getNumber()] == &MBB && + "Block should be balanced"); + Stack.pop_back(); + break; + case WebAssembly::LOOP: + assert(Stack.back() == &MBB && "Loop top should be balanced"); + Stack.pop_back(); + Stack.pop_back(); + break; + case WebAssembly::END_BLOCK: + Stack.push_back(&MBB); + break; + case WebAssembly::END_LOOP: + Stack.push_back(&MBB); + Stack.push_back(LoopTops[&MI]); + break; + default: + if (MI.isTerminator()) { + // Rewrite MBB operands to be depth immediates. + SmallVector<MachineOperand, 4> Ops(MI.operands()); + while (MI.getNumOperands() > 0) + MI.RemoveOperand(MI.getNumOperands() - 1); + for (auto MO : Ops) { + if (MO.isMBB()) + MO = MachineOperand::CreateImm(GetDepth(Stack, MO.getMBB())); + MI.addOperand(MF, MO); + } + } + break; + } + } + } + assert(Stack.empty() && "Control flow should be balanced"); } -#endif bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** CFG Stackifying **********\n" @@ -415,7 +476,9 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { const auto &MLI = getAnalysis<MachineLoopInfo>(); auto &MDT = getAnalysis<MachineDominatorTree>(); + // Liveness is not tracked for EXPR_STACK physreg. const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + MF.getRegInfo().invalidateLiveness(); // RPO sorting needs all loops to be single-entry. EliminateMultipleEntryLoops(MF, MLI); @@ -426,43 +489,5 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { // Place the BLOCK and LOOP markers to indicate the beginnings of scopes. PlaceMarkers(MF, MLI, TII, MDT); -#ifndef NDEBUG - // Verify that block and loop beginnings and endings are in LIFO order, and - // that all references to blocks are to blocks on the stack at the point of - // the reference. - SmallVector<std::pair<MachineBasicBlock *, bool>, 0> Stack; - for (auto &MBB : MF) { - while (!Stack.empty() && Stack.back().first == &MBB) - if (Stack.back().second) { - assert(Stack.size() >= 2); - Stack.pop_back(); - Stack.pop_back(); - } else { - assert(Stack.size() >= 1); - Stack.pop_back(); - } - for (auto &MI : MBB) - switch (MI.getOpcode()) { - case WebAssembly::LOOP: - Stack.push_back(std::make_pair(&MBB, false)); - Stack.push_back(std::make_pair(MI.getOperand(0).getMBB(), true)); - break; - case WebAssembly::BLOCK: - Stack.push_back(std::make_pair(MI.getOperand(0).getMBB(), false)); - break; - default: - // Verify that all referenced blocks are in scope. A reference to a - // block with a negative number is invalid, but can happen with inline - // asm, so we shouldn't assert on it, but instead let CodeGen properly - // fail on it. 
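The rewriting loop above is the heart of this change: once the BLOCK/LOOP/END_* markers are placed, every branch's basic-block operand is replaced by an immediate giving how many enclosing scopes separate the branch from its target, counted from the innermost scope outward. A toy model of GetDepth over a plain vector of scope names, assuming scopes Outer, Middle, and Inner were pushed in that order:

#include <cassert>
#include <string>
#include <vector>

// Depth 0 is the innermost (most recently pushed) scope, as in GetDepth above.
static unsigned getDepth(const std::vector<std::string> &Stack,
                         const std::string &Target) {
  unsigned Depth = 0;
  for (auto I = Stack.rbegin(), E = Stack.rend(); I != E; ++I) {
    if (*I == Target)
      return Depth;
    ++Depth;
  }
  assert(false && "branch destination should be in scope");
  return Depth;
}

int main() {
  std::vector<std::string> Stack{"Outer", "Middle", "Inner"};
  assert(getDepth(Stack, "Inner") == 0);   // branch targets the current scope
  assert(getDepth(Stack, "Middle") == 1);
  assert(getDepth(Stack, "Outer") == 2);   // branch exits all three scopes
  return 0;
}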
- for (const MachineOperand &MO : MI.explicit_operands()) - if (MO.isMBB() && MO.getMBB()->getNumber() >= 0) - assert(IsOnStack(Stack, MO.getMBB())); - break; - } - } - assert(Stack.empty()); -#endif - return true; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7a89f788c1ad..e9933b092988 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -573,7 +573,8 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op, SDLoc DL(Op); const auto *GA = cast<GlobalAddressSDNode>(Op); EVT VT = Op.getValueType(); - assert(GA->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); + assert(GA->getTargetFlags() == 0 && + "Unexpected target flags on generic GlobalAddressSDNode"); if (GA->getAddressSpace() != 0) fail(DL, DAG, "WebAssembly only expects the 0 address space"); return DAG.getNode( @@ -587,9 +588,16 @@ WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op, SDLoc DL(Op); const auto *ES = cast<ExternalSymbolSDNode>(Op); EVT VT = Op.getValueType(); - assert(ES->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); + assert(ES->getTargetFlags() == 0 && + "Unexpected target flags on generic ExternalSymbolSDNode"); + // Set the TargetFlags to 0x1 which indicates that this is a "function" + // symbol rather than a data symbol. We do this unconditionally even though + // we don't know anything about the symbol other than its name, because all + // external symbols used in target-independent SelectionDAG code are for + // functions. return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, - DAG.getTargetExternalSymbol(ES->getSymbol(), VT)); + DAG.getTargetExternalSymbol(ES->getSymbol(), VT, + /*TargetFlags=*/0x1)); } SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op, diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 05efe8903413..fda95953db81 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -41,28 +41,33 @@ let Defs = [ARGUMENTS] in { // TODO: SelectionDAG's lowering insists on using a pointer as the index for // jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode // currently. +// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates. +// Set TSFlags{1} to 1 to indicate that the immediates represent labels. let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops), [(WebAssemblytableswitch I32:$index, bb:$default)], - "tableswitch\t$index, $default">; + "tableswitch\t$index, $default"> { + let TSFlags{0} = 1; + let TSFlags{1} = 1; +} def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops), [(WebAssemblytableswitch I64:$index, bb:$default)], - "tableswitch\t$index, $default">; + "tableswitch\t$index, $default"> { + let TSFlags{0} = 1; + let TSFlags{1} = 1; +} } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 -// Placemarkers to indicate the start of a block or loop scope. These +// Placemarkers to indicate the start or end of a block or loop scope. These // use/clobber EXPR_STACK to prevent them from being moved into the middle of // an expression tree. 
let Uses = [EXPR_STACK], Defs = [EXPR_STACK] in { -def BLOCK : I<(outs), (ins bb_op:$dst), [], "block \t$dst">; -def LOOP : I<(outs), (ins bb_op:$dst), [], "loop \t$dst">; +def BLOCK : I<(outs), (ins), [], "block">; +def LOOP : I<(outs), (ins), [], "loop">; +def END_BLOCK : I<(outs), (ins), [], "end_block">; +def END_LOOP : I<(outs), (ins), [], "end_loop">; } // Uses = [EXPR_STACK], Defs = [EXPR_STACK] -// No-op to indicate to the AsmPrinter that a loop ends here, so a -// basic block label is needed even if it wouldn't otherwise appear so. -let isTerminator = 1, hasCtrlDep = 1 in -def LOOP_END : I<(outs), (ins), []>; - multiclass RETURN<WebAssemblyRegClass vt> { def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)], "return \t$val">; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 5e7663cdb506..028e9af0834f 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -74,6 +74,9 @@ bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, case WebAssembly::BR_IF: if (HaveCond) return true; + // If we're running after CFGStackify, we can't optimize further. + if (!MI.getOperand(1).isMBB()) + return true; Cond.push_back(MachineOperand::CreateImm(true)); Cond.push_back(MI.getOperand(0)); TBB = MI.getOperand(1).getMBB(); @@ -82,12 +85,18 @@ bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, case WebAssembly::BR_UNLESS: if (HaveCond) return true; + // If we're running after CFGStackify, we can't optimize further. + if (!MI.getOperand(1).isMBB()) + return true; Cond.push_back(MachineOperand::CreateImm(false)); Cond.push_back(MI.getOperand(0)); TBB = MI.getOperand(1).getMBB(); HaveCond = true; break; case WebAssembly::BR: + // If we're running after CFGStackify, we can't optimize further. + if (!MI.getOperand(0).isMBB()) + return true; if (!HaveCond) TBB = MI.getOperand(0).getMBB(); else diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index f0b4ce7caf51..2e682a475471 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -66,8 +66,18 @@ def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", // WebAssembly-specific Operands. //===----------------------------------------------------------------------===// +let OperandNamespace = "WebAssembly" in { + +let OperandType = "OPERAND_BASIC_BLOCK" in def bb_op : Operand<OtherVT>; +let OperandType = "OPERAND_FPIMM" in { +def f32imm_op : Operand<f32>; +def f64imm_op : Operand<f64>; +} // OperandType = "OPERAND_FPIMM" + +} // OperandNamespace = "WebAssembly" + //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. 
//===----------------------------------------------------------------------===// @@ -120,31 +130,20 @@ def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm), def CONST_I64 : I<(outs I64:$res), (ins i64imm:$imm), [(set I64:$res, imm:$imm)], "i64.const\t$res, $imm">; -def CONST_F32 : I<(outs F32:$res), (ins f32imm:$imm), +def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm), [(set F32:$res, fpimm:$imm)], "f32.const\t$res, $imm">; -def CONST_F64 : I<(outs F64:$res), (ins f64imm:$imm), +def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm), [(set F64:$res, fpimm:$imm)], "f64.const\t$res, $imm">; } // isMoveImm = 1 } // Defs = [ARGUMENTS] -def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$dst)), - (CONST_I32 tglobaladdr:$dst)>; -def : Pat<(i32 (WebAssemblywrapper texternalsym:$dst)), - (CONST_I32 texternalsym:$dst)>; -def : Pat<(i32 (WebAssemblywrapper tjumptable:$dst)), - (CONST_I32 tjumptable:$dst)>; - -let Defs = [ARGUMENTS] in { - -// Function signature and local variable declaration "instructions". -def PARAM : I<(outs), (ins variable_ops), [], ".param \t">; -def RESULT : I<(outs), (ins variable_ops), [], ".result \t">; -def LOCAL : I<(outs), (ins variable_ops), [], ".local \t">; - -} // Defs = [ARGUMENTS] +def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), + (CONST_I32 tglobaladdr:$addr)>; +def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), + (CONST_I32 texternalsym:$addr)>; //===----------------------------------------------------------------------===// // Additional sets of instructions. diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 74ec45d58644..b39ac5212f87 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -24,10 +24,25 @@ // WebAssembly constant offsets are performed as unsigned with infinite // precision, so we need to check for NoUnsignedWrap so that we don't fold an // offset for an add that needs wrapping. -def regPlusImm : PatFrag<(ops node:$off, node:$addr), +def regPlusImm : PatFrag<(ops node:$addr, node:$off), (add node:$addr, node:$off), [{ return N->getFlags()->hasNoUnsignedWrap(); }]>; +// GlobalAddresses are conceptually unsigned values, so we can also fold them +// into immediate values as long as their offsets are non-negative. +def regPlusGA : PatFrag<(ops node:$addr, node:$off), + (add node:$addr, node:$off), + [{ + return N->getFlags()->hasNoUnsignedWrap() || + (N->getOperand(1)->getOpcode() == WebAssemblyISD::Wrapper && + isa<GlobalAddressSDNode>(N->getOperand(1)->getOperand(0)) && + cast<GlobalAddressSDNode>(N->getOperand(1)->getOperand(0)) + ->getOffset() >= 0); +}]>; + +// We don't need a regPlusES because external symbols never have constant +// offsets folded into them, so we can just use add. + let Defs = [ARGUMENTS] in { // Basic load. @@ -49,29 +64,33 @@ def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>; def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>; // Select loads with a constant offset. 
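The NoUnsignedWrap and non-negative-offset requirements in the regPlusImm/regPlusGA fragments above guard a real semantic difference: a wasm load adds its constant offset to the base address as an unbounded unsigned value, so a negative offset cannot simply be folded out of a wrapping 32-bit add. A small arithmetic illustration with made-up values:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Base = 8;
  int32_t Off = -4;

  // What the 32-bit 'add' node computes: it wraps, yielding address 4.
  uint32_t WrappedAddr = Base + uint32_t(Off);

  // What folding Off into the load's unsigned offset field would mean:
  // 8 + 0xFFFFFFFC evaluated without wrapping, i.e. 4294967300.
  uint64_t FoldedAddr = uint64_t(Base) + uint64_t(uint32_t(Off));

  std::printf("wrapped add: %u, folded offset: %llu\n", WrappedAddr,
              (unsigned long long)FoldedAddr);
  return WrappedAddr == 4 && FoldedAddr == 4294967300ULL ? 0 : 1;
}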
-def : Pat<(i32 (load (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (load (regPlusImm I32:$addr, imm:$off))), (LOAD_I32 imm:$off, $addr)>; -def : Pat<(i64 (load (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (load (regPlusImm I32:$addr, imm:$off))), (LOAD_I64 imm:$off, $addr)>; -def : Pat<(f32 (load (regPlusImm imm:$off, I32:$addr))), +def : Pat<(f32 (load (regPlusImm I32:$addr, imm:$off))), (LOAD_F32 imm:$off, $addr)>; -def : Pat<(f64 (load (regPlusImm imm:$off, I32:$addr))), +def : Pat<(f64 (load (regPlusImm I32:$addr, imm:$off))), (LOAD_F64 imm:$off, $addr)>; -def : Pat<(i32 (load (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (load (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD_I64 tglobaladdr:$off, $addr)>; -def : Pat<(f32 (load (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(f32 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD_F32 tglobaladdr:$off, $addr)>; -def : Pat<(f64 (load (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(f64 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD_F64 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (load (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), (LOAD_I32 texternalsym:$off, $addr)>; -def : Pat<(i64 (load (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), (LOAD_I64 texternalsym:$off, $addr)>; -def : Pat<(f32 (load (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(f32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), (LOAD_F32 texternalsym:$off, $addr)>; -def : Pat<(f64 (load (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(f64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), (LOAD_F64 texternalsym:$off, $addr)>; // Select loads with just a constant offset. @@ -135,65 +154,85 @@ def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>; def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; // Select extending loads with a constant offset. 
-def : Pat<(i32 (sextloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (sextloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_S_I32 imm:$off, $addr)>; -def : Pat<(i32 (zextloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (zextloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_U_I32 imm:$off, $addr)>; -def : Pat<(i32 (sextloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (sextloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_S_I32 imm:$off, $addr)>; -def : Pat<(i32 (zextloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (zextloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_U_I32 imm:$off, $addr)>; -def : Pat<(i64 (sextloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (sextloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_S_I64 imm:$off, $addr)>; -def : Pat<(i64 (zextloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (zextloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_U_I64 imm:$off, $addr)>; -def : Pat<(i64 (sextloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (sextloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_S_I64 imm:$off, $addr)>; -def : Pat<(i64 (zextloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (zextloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_U_I64 imm:$off, $addr)>; -def : Pat<(i64 (sextloadi32 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (sextloadi32 (regPlusImm I32:$addr, imm:$off))), (LOAD32_S_I64 imm:$off, $addr)>; -def : Pat<(i64 (zextloadi32 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (zextloadi32 (regPlusImm I32:$addr, imm:$off))), (LOAD32_U_I64 imm:$off, $addr)>; -def : Pat<(i32 (sextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (sextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_S_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (zextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (zextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_U_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (sextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (sextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_S_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (zextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (zextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_U_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (sextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (sextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_S_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (zextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (zextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (sextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (sextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_S_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (zextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (zextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (sextloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (sextloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD32_S_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (zextloadi32 (regPlusImm tglobaladdr:$off, 
I32:$addr))), +def : Pat<(i64 (zextloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD32_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (sextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (sextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_S_I32 texternalsym:$off, $addr)>; -def : Pat<(i32 (zextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (zextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_U_I32 texternalsym:$off, $addr)>; -def : Pat<(i32 (sextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (sextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_S_I32 texternalsym:$off, $addr)>; -def : Pat<(i32 (zextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (zextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_U_I32 texternalsym:$off, $addr)>; -def : Pat<(i64 (sextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (sextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_S_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (zextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (zextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_U_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (sextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (sextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_S_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (zextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (zextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_U_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (sextloadi32 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (sextloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD32_S_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (zextloadi32 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (zextloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD32_U_I64 texternalsym:$off, $addr)>; // Select extending loads with just a constant offset. @@ -259,35 +298,45 @@ def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; // Select "don't care" extending loads with a constant offset. 
-def : Pat<(i32 (extloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (extloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_U_I32 imm:$off, $addr)>; -def : Pat<(i32 (extloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (extloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_U_I32 imm:$off, $addr)>; -def : Pat<(i64 (extloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (extloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_U_I64 imm:$off, $addr)>; -def : Pat<(i64 (extloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (extloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_U_I64 imm:$off, $addr)>; -def : Pat<(i64 (extloadi32 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (extloadi32 (regPlusImm I32:$addr, imm:$off))), (LOAD32_U_I64 imm:$off, $addr)>; -def : Pat<(i32 (extloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (extloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_U_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (extloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (extloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_U_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (extloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (extloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (extloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (extloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (extloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (extloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD32_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (extloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (extloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_U_I32 texternalsym:$off, $addr)>; -def : Pat<(i32 (extloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (extloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_U_I32 texternalsym:$off, $addr)>; -def : Pat<(i64 (extloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (extloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_U_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (extloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (extloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_U_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (extloadi32 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (extloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD32_U_I64 texternalsym:$off, $addr)>; // Select "don't care" extending loads with just a constant offset. @@ -343,29 +392,37 @@ def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>; def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>; // Select stores with a constant offset. 
-def : Pat<(store I32:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)), (STORE_I32 imm:$off, I32:$addr, I32:$val)>; -def : Pat<(store I64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE_I64 imm:$off, I32:$addr, I64:$val)>; -def : Pat<(store F32:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)), (STORE_F32 imm:$off, I32:$addr, F32:$val)>; -def : Pat<(store F64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE_F64 imm:$off, I32:$addr, F64:$val)>; -def : Pat<(store I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(store I32:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; -def : Pat<(store I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(store I64:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; -def : Pat<(store F32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(store F32:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE_F32 tglobaladdr:$off, I32:$addr, F32:$val)>; -def : Pat<(store F64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(store F64:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE_F64 tglobaladdr:$off, I32:$addr, F64:$val)>; -def : Pat<(store I32:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(store I32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE_I32 texternalsym:$off, I32:$addr, I32:$val)>; -def : Pat<(store I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(store I64:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE_I64 texternalsym:$off, I32:$addr, I64:$val)>; -def : Pat<(store F32:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(store F32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE_F32 texternalsym:$off, I32:$addr, F32:$val)>; -def : Pat<(store F64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(store F64:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE_F64 texternalsym:$off, I32:$addr, F64:$val)>; // Select stores with just a constant offset. @@ -423,35 +480,54 @@ def : Pat<(truncstorei32 I64:$val, I32:$addr), (STORE32_I64 0, I32:$addr, I64:$val)>; // Select truncating stores with a constant offset. 
-def : Pat<(truncstorei8 I32:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)), (STORE8_I32 imm:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei16 I32:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)), (STORE16_I32 imm:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei8 I64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE8_I64 imm:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei16 I64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE16_I64 imm:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei32 I64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE32_I64 imm:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei8 I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei8 I32:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE8_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei16 I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei16 I32:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE16_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei8 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei8 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE8_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei16 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei16 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE16_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei32 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei32 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE32_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei8 I32:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei8 I32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE8_I32 texternalsym:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei16 I32:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei16 I32:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE16_I32 texternalsym:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei8 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei8 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE8_I64 texternalsym:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei16 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei16 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE16_I64 texternalsym:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei32 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei32 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE32_I64 texternalsym:$off, I32:$addr, I64:$val)>; // Select truncating stores with just a constant offset. 
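For context, a minimal C++-level sketch (not part of this diff; the function and array names are purely illustrative) of the kind of address computation the rewritten WebAssembly patterns fold. An access to a global array element computes index register plus symbolic base, and the regPlusGA/add forms above let the wrapped global address land in the load or store instruction's offset field instead of a separate i32.add:

extern int table[256];   // hypothetical global; its address is the tglobaladdr offset

int load_elem(int i) {
  // The address is (index register) + (address of 'table'); with the patterns
  // above, the symbolic part folds into the offset operand of the load.
  return table[i];
}

The same shape applies to the texternalsym and truncating-store patterns; only the matched node (regPlusGA vs. plain add) and the selected instruction differ.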
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index a953f8247006..022a448590ec 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -36,15 +36,17 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); } -MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(const MachineOperand &MO, - MCSymbol *Sym) const { - assert(MO.getTargetFlags() == 0 && "WebAssembly does not use target flags"); +MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym, + int64_t Offset, + bool IsFunc) const { + MCSymbolRefExpr::VariantKind VK = + IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION + : MCSymbolRefExpr::VK_None; + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx); - const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - - int64_t Offset = MO.getOffset(); if (Offset != 0) { - assert(!MO.isJTI() && "Unexpected offset with jump table index"); + if (IsFunc) + report_fatal_error("Function addresses with offsets not supported"); Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx); } @@ -64,6 +66,9 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, default: MI->dump(); llvm_unreachable("unknown operand type"); + case MachineOperand::MO_MachineBasicBlock: + MI->dump(); + llvm_unreachable("MachineBasicBlock operand should have been rewritten"); case MachineOperand::MO_Register: { // Ignore all implicit register operands. if (MO.isImplicit()) @@ -89,15 +94,19 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, llvm_unreachable("unknown floating point immediate type"); break; } - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::createExpr( - MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx)); - break; case MachineOperand::MO_GlobalAddress: - MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); + assert(MO.getTargetFlags() == 0 && + "WebAssembly does not use target flags on GlobalAddresses"); + MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(), + MO.getGlobal()->getValueType()->isFunctionTy()); break; case MachineOperand::MO_ExternalSymbol: - MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); + // The target flag indicates whether this is a symbol for a + // variable or a function. 
+ assert((MO.getTargetFlags() & -2) == 0 && + "WebAssembly uses only one target flag bit on ExternalSymbols"); + MCOp = LowerSymbolOperand(GetExternalSymbolSymbol(MO), /*Offset=*/0, + MO.getTargetFlags() & 1); break; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h index 6d704704f576..ab4ba1c28d53 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h @@ -31,9 +31,10 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower { MCContext &Ctx; AsmPrinter &Printer; - MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; + MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset, + bool IsFunc) const; public: WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer) diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 89ef5cdb2bef..537c147e6142 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -147,8 +147,10 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { // block boundaries, and the blocks aren't ordered so the block visitation // order isn't significant, but we may want to change this in the future. for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : reverse(MBB)) { - MachineInstr *Insert = &MI; + // Don't use a range-based for loop, because we modify the list as we're + // iterating over it and the end iterator may change. + for (auto MII = MBB.rbegin(); MII != MBB.rend(); ++MII) { + MachineInstr *Insert = &*MII; // Don't nest anything inside a phi. if (Insert->getOpcode() == TargetOpcode::PHI) break; @@ -221,7 +223,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { Insert = Def; } if (AnyStackified) - ImposeStackOrdering(&MI); + ImposeStackOrdering(&*MII); } } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index dcada45f96d1..90d8dda530ba 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -61,17 +61,23 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex( MachineFunction &MF = *MBB.getParent(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); const MachineFrameInfo& MFI = *MF.getFrameInfo(); - int FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex); + int64_t FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex); if (MI.mayLoadOrStore()) { // If this is a load or store, make it relative to SP and fold the frame - // offset directly in - assert(MI.getOperand(1).getImm() == 0 && - "Can't eliminate FI yet if offset is already set"); - MI.getOperand(1).setImm(FrameOffset); + // offset directly in. + assert(FrameOffset >= 0 && MI.getOperand(1).getImm() >= 0); + int64_t Offset = MI.getOperand(1).getImm() + FrameOffset; + + if (static_cast<uint64_t>(Offset) > std::numeric_limits<uint32_t>::max()) { + // If this happens the program is invalid, but better to error here than + // generate broken code. 
+ report_fatal_error("Memory offset field overflow"); + } + MI.getOperand(1).setImm(Offset); MI.getOperand(2).ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false); } else { - // Otherwise create an i32.add SP, offset and make it the operand + // Otherwise create an i32.add SP, offset and make it the operand. auto &MRI = MF.getRegInfo(); const auto *TII = MF.getSubtarget().getInstrInfo(); diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index e31ea46de9f5..b290b4bf7440 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -45,8 +45,9 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT.isArch64Bit() ? "e-p:64:64-i64:64-n32:64-S128" - : "e-p:32:32-i64:64-n32:64-S128", + : LLVMTargetMachine(T, + TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128" + : "e-m:e-p:32:32-i64:64-n32:64-S128", TT, CPU, FS, Options, RM, CM, OL), TLOF(make_unique<WebAssemblyTargetObjectFile>()) { // WebAssembly type-checks expressions, but a noreturn function with a return diff --git a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt index 92ecde3f90d6..91b3fff05dca 100644 --- a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt +++ b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -5,23 +5,6 @@ pr38151.c va-arg-22.c -# WebAssemblyRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator, int, unsigned int, llvm::RegScavenger *) const: Assertion `MI.getOperand(1).getImm() == 0 && "Can't eliminate FI yet if offset is already set"' -20030313-1.c -20030916-1.c -20031012-1.c -20041126-1.c -20060420-1.c -20071202-1.c -20120808-1.c -pr20527-1.c -pr27073.c -pr36339.c -pr37573.c -pr43236.c -pr43835.c -pr45070.c -pr51933.c - # TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed. struct-ret-1.c va-arg-11.c @@ -140,8 +123,6 @@ pr38051.c pr39100.c pr39339.c -pr40022.c -pr40657.c pr43987.c diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index fbec6626d99d..01e65b89f480 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -29,7 +29,7 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); /// This pass initializes a global base register for PIC on x86-32. -FunctionPass* createX86GlobalBaseRegPass(); +FunctionPass *createX86GlobalBaseRegPass(); /// This pass combines multiple accesses to local-dynamic TLS variables so that /// the TLS base address for the module is only fetched once per execution path @@ -49,12 +49,13 @@ FunctionPass *createX86IssueVZeroUpperPass(); /// This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); -/// Return a a pass that selectively replaces certain instructions (like add, +/// Return a pass that selectively replaces certain instructions (like add, /// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA /// instructions, in order to eliminate execution delays in some processors. 
FunctionPass *createX86FixupLEAs(); -/// Return a pass that removes redundant address recalculations. +/// Return a pass that removes redundant LEA instructions and redundant address +/// recalculations. FunctionPass *createX86OptimizeLEAs(); /// Return a pass that optimizes the code-size of x86 call sequences. This is diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 54d88cbb244e..e8b96e74a7af 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -831,6 +831,12 @@ def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, R8, R9, R10, R11)>; +// CSRs that are handled by prologue, epilogue. +def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>; + +// CSRs that are handled explicitly via copies. +def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>; + // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, R8, R9, R10, RSP)>; diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index 629d4d3565f2..f48b47934e03 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -1002,6 +1002,9 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index d31aab0fa141..1ec93b5f2d23 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -265,7 +265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } - } + } else if (!Subtarget->is64Bit()) + setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes @@ -2310,6 +2311,18 @@ X86TargetLowering::LowerReturn(SDValue Chain, DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (X86::GR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } + RetOps[0] = Chain; // Update chain. // Add the flag if we have it. 
@@ -3907,6 +3920,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: + case X86ISD::INSERTPS: case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: @@ -4157,6 +4171,35 @@ static bool hasFPCMov(unsigned X86CC) { } } + +bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + + const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); + if (!IntrData) + return false; + + switch (IntrData->Type) { + case LOADA: + case LOADU: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1); + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + return true; + } + default: + break; + } + + return false; +} + /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. @@ -4743,8 +4786,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, /// uses one source. Note that this will set IsUnary for shuffles which use a /// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. -/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. -static bool getTargetShuffleMask(SDNode *N, MVT VT, +static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; @@ -4761,6 +4803,11 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::INSERTPS: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); @@ -4870,10 +4917,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); - // Mask only contains negative index if an element is zero. - if (std::any_of(Mask.begin(), Mask.end(), - [](int M){ return M == SM_SentinelZero; })) - return false; + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); @@ -5008,6 +5052,12 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (Mask.empty()) return false; + // Check if we're getting a shuffle mask with zero'd elements. + if (!AllowSentinelZero) + if (std::any_of(Mask.begin(), Mask.end(), + [](int M){ return M == SM_SentinelZero; })) + return false; + // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. @@ -5046,19 +5096,19 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. 
if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); - unsigned NumElems = ShufVT.getVectorNumElements(); + int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; - if (Elt < 0) + if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufVT.getVectorElementType()); - SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) - : N->getOperand(1); + assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); + SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -8165,6 +8215,13 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; + MVT BroadcastVT = VT; + + // Peek through any bitcast (only useful for loads). + SDValue BC = V; + while (BC.getOpcode() == ISD::BITCAST) + BC = BC.getOperand(0); + // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { @@ -8174,13 +8231,17 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { + } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) { + // 32-bit targets need to load i64 as a f64 and then bitcast the result. + if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64) + BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); + // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. - LoadSDNode *Ld = cast<LoadSDNode>(V); + LoadSDNode *Ld = cast<LoadSDNode>(BC); SDValue BaseAddr = Ld->getOperand(1); EVT AddrVT = BaseAddr.getValueType(); - EVT SVT = VT.getScalarType(); + EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); SDValue NewAddr = DAG.getNode( ISD::ADD, DL, AddrVT, BaseAddr, @@ -8194,7 +8255,8 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, return SDValue(); } - return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); + V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V); + return DAG.getBitcast(VT, V); } // Check for whether we can use INSERTPS to perform the shuffle. We only use @@ -12474,8 +12536,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); + Chain = + DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), + DAG.getIntPtrConstant(0, DL, true), SDValue(), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -12648,13 +12714,21 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } + SDValue ValueToStore = Op.getOperand(0); + if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && + !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( - DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, + DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); @@ -13027,7 +13101,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + SDValue ValueToStore = Op.getOperand(0); + if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input @@ -17487,7 +17567,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); @@ -17513,7 +17592,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case TRUNCATE_TO_MEM_VI32: return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); @@ -17533,6 +17611,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Mask, PassThru, Subtarget, DAG), Chain}; return DAG.getMergeValues(Results, dl); } + case LOADU: + case LOADA: { + SDValue Mask = Op.getOperand(4); + SDValue PassThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); + assert(MemIntr && "Expected MemIntrinsicSDNode!"); + + if (isAllOnesConstant(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, + MemIntr->getMemOperand(), ISD::NON_EXTLOAD); + } } } @@ -19512,24 +19609,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); - if (SrcVT 
== MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); - SDValue InVec = Op->getOperand(0); - SDLoc dl(Op); - unsigned NumElts = SrcVT.getVectorNumElements(); - MVT SVT = SrcVT.getVectorElementType(); - - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. + SDValue Op0 = Op->getOperand(0); SmallVector<SDValue, 16> Elts; - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, - DAG.getIntPtrConstant(i, dl))); - + SDLoc dl(Op); + unsigned NumElts; + MVT SVT; + if (SrcVT.isVector()) { + NumElts = SrcVT.getVectorNumElements(); + SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, + DAG.getIntPtrConstant(i, dl))); + } else { + assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() && + "Unexpected source type in LowerBITCAST"); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(0, dl))); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(1, dl))); + NumElts = 2; + SVT = MVT::i32; + } // Explicitly mark the extra elements as Undef. Elts.append(NumElts, DAG.getUNDEF(SVT)); @@ -20685,6 +20795,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::VROTLI: return "X86ISD::VROTLI"; + case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; @@ -23184,7 +23296,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, return false; SmallVector<int, 16> OpMask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); + bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary); // We only can combine unary shuffles which we can decode the mask for. if (!HaveMask || !IsUnary) return false; @@ -23281,7 +23393,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector<int, 4> Mask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); + bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary); (void)HaveMask; assert(HaveMask); @@ -23854,6 +23966,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); + EVT EltVT = N->getValueType(0); if (!isa<ConstantSDNode>(EltNo)) return SDValue(); @@ -23882,14 +23995,22 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. 
unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; + int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; + + if (Idx == SM_SentinelZero) + return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) + : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); + if (Idx == SM_SentinelUndef) + return DAG.getUNDEF(EltVT); + + assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); @@ -23914,7 +24035,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); - EVT EltVT = N->getValueType(0); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); @@ -27233,6 +27353,32 @@ static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); } +/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> +/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y) +/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly +/// extends from AH (which we otherwise need to do contortions to access). +static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + auto OpcodeN = N->getOpcode(); + auto OpcodeN0 = N0.getOpcode(); + if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) || + (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT InVT = N0.getValueType(); + if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32) + return SDValue(); + + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG + : X86ISD::UDIVREM8_ZEXT_HREG; + SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), + N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); +} + static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -27243,18 +27389,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, EVT InSVT = InVT.getScalarType(); SDLoc DL(N); - // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> - // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) - // This exposes the sext to the sdivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). 
- if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && - InVT == MVT::i8 && VT == MVT::i32) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { @@ -27413,19 +27549,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - // (i8,i32 zext (udivrem (i8 x, i8 y)) -> - // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) - // This exposes the zext to the udivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). - if (N0.getOpcode() == ISD::UDIVREM && - N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && - VT == MVT::i32) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; return SDValue(); } @@ -27923,7 +28048,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); -// TODO: refactor the [SU]DIVREM8_[SZ]EXT_HREG code so that it's not duplicated. case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); @@ -28763,3 +28887,51 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { Attribute::MinSize); return OptSize && !VT.isVector(); } + +void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + if (!Subtarget->is64Bit()) + return; + + // Update IsSplitCSR in X86MachineFunctionInfo. + X86MachineFunctionInfo *AFI = + Entry->getParent()->getInfo<X86MachineFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void X86TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (X86::GR64RegClass.contains(*I)) + RC = &X86::GR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. 
+ assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 8bb0e5f8bd36..0ab786e08e02 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -316,6 +316,9 @@ namespace llvm { // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, + // Bit rotate by immediate + VROTLI, VROTRI, + // Vector packed double/float comparison. CMPP, @@ -837,6 +840,13 @@ namespace llvm { /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + /// Given an intrinsic, checks if on the target the intrinsic will need to map + /// to a MemIntrinsicNode (touches memory). If this is the case, it returns + /// true and stores the intrinsic information into the IntrinsicInfo that was + /// passed to the function. + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const override; + /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. @@ -1057,6 +1067,15 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const override; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(CallInst *CI) const override; diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index 0a27c33f033e..49be64883939 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -188,7 +188,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, let isCommutable = IsCommutable in def NAME: AVX512<O, F, Outs, Ins, OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# - "$dst , "#IntelSrcAsm#"}", + "$dst, "#IntelSrcAsm#"}", Pattern, itin>; // Prefer over VMOV*rrk Pat<> @@ -323,18 +323,16 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, list<dag> Pattern, - list<dag> MaskingPattern, - string Round = "", - InstrItinClass itin = NoItinerary> { + list<dag> MaskingPattern> { def NAME: AVX512<O, F, Outs, Ins, - OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# - "$dst "#Round#", "#IntelSrcAsm#"}", - Pattern, itin>; + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst, "#IntelSrcAsm#"}", + Pattern, NoItinerary>; def NAME#k: AVX512<O, F, Outs, MaskingIns, - OpcodeStr#"\t{"#Round#AttSrcAsm#", $dst {${mask}}|"# - "$dst {${mask}}, "#IntelSrcAsm#Round#"}", - MaskingPattern, itin>, EVEX_K; + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", + MaskingPattern, NoItinerary>, EVEX_K; } 
multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, @@ -342,33 +340,27 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Ins, dag MaskingIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, dag MaskingRHS, - string Round = "", - InstrItinClass itin = NoItinerary> : + dag RHS, dag MaskingRHS> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, AttSrcAsm, IntelSrcAsm, [(set _.KRC:$dst, RHS)], - [(set _.KRC:$dst, MaskingRHS)], - Round, NoItinerary>; + [(set _.KRC:$dst, MaskingRHS)]>; multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, string Round = "", - InstrItinClass itin = NoItinerary> : + dag RHS> : AVX512_maskable_common_cmp<O, F, _, Outs, Ins, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (and _.KRCWM:$mask, RHS), - Round, itin>; + (and _.KRCWM:$mask, RHS)>; multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, - AttSrcAsm, IntelSrcAsm, - [],[],"", NoItinerary>; + AttSrcAsm, IntelSrcAsm, [],[]>; // Bitcasts between 512-bit vector types. Return the original type since // no instruction is needed for the conversion @@ -1294,7 +1286,7 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, EVEX_4V; def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), @@ -1311,7 +1303,7 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), @@ -1426,7 +1418,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2,{sae}", + "{sae}, $src2, $src1", "$src1, $src2, {sae}", (OpNodeRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc, @@ -1449,7 +1441,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc,{sae}, $src2, $src1","$src1, $src2,{sae}, $cc">, + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">, EVEX_4V, EVEX_B; }// let isAsmParserOnly = 1, hasSideEffects = 0 @@ -1831,7 +1823,7 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2,{sae}", + "{sae}, $src2, $src1", "$src1, $src2, {sae}", (X86cmpmRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc, @@ -1842,8 +1834,8 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc,{sae}, $src2, 
$src1", - "$src1, $src2,{sae}, $cc">, EVEX_B; + "$cc, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $cc">, EVEX_B; } } @@ -1889,13 +1881,13 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [prd] in { def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), (i32 imm:$src2)))], NoItinerary>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix# - "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (i32 imm:$src2))))], NoItinerary>, EVEX_K; @@ -1903,14 +1895,14 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix## - "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst, (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i32 imm:$src2)))], NoItinerary>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix## - "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i32 imm:$src2))))], NoItinerary>, EVEX_K; @@ -1925,13 +1917,13 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string mem, string broadcast>{ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), (i32 imm:$src2)))], NoItinerary>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix# - "\t{$src2, $src1, $dst {${mask}}| $dst {${mask}}, $src1, $src2}", + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (i32 imm:$src2))))], NoItinerary>, EVEX_K; @@ -1939,21 +1931,21 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##mem# - "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i32 imm:$src2)))], NoItinerary>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##mem# - "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i32 imm:$src2))))], NoItinerary>, EVEX_K; def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## - _.BroadcastStr##", $dst | $dst, ${src1}" + 
_.BroadcastStr##", $dst|$dst, ${src1}" ##_.BroadcastStr##", $src2}", [(set _.KRC:$dst,(OpNode (_.VT (X86VBroadcast @@ -1962,7 +1954,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## - _.BroadcastStr##", $dst {${mask}} | $dst {${mask}}, ${src1}"## + _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## _.BroadcastStr##", $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT (X86VBroadcast @@ -2715,30 +2707,6 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; -def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), - (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; - -def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), - (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; - -def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VMOVAPDZrm addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VMOVAPSZrm addr:$ptr)>; - def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src), GR16:$mask), (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), @@ -4088,8 +4056,8 @@ defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>, avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V; -defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>, AVX512BIi8Base, EVEX_4V; -defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>, AVX512BIi8Base, EVEX_4V; +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V; defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; @@ -6057,12 +6025,12 @@ multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, let mayStore = 1 in { def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, SrcInfo.RC:$src), - OpcodeStr # "\t{$src, $dst |$dst, $src}", + OpcodeStr # "\t{$src, $dst|$dst, $src}", []>, EVEX; def mrk : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", []>, EVEX, EVEX_K; }//mayStore = 1 } @@ -6666,12 +6634,12 @@ multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, let mayStore = 1 in { def mr : AVX5128I<opc, MRMDestMem, (outs), (ins 
_.MemOp:$dst, _.RC:$src), - OpcodeStr # "\t{$src, $dst |$dst, $src}", + OpcodeStr # "\t{$src, $dst|$dst, $src}", []>, EVEX_CD8<_.EltSize, CD8VT1>; def mrk : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", [(store (_.VT (vselect _.KRCWM:$mask, (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), addr:$dst)]>, @@ -6766,7 +6734,7 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix, "$src2,{sae}, $src1", + OpcodeStr##_.Suffix, "$src2, {sae}, $src1", "$src1, {sae}, $src2", (OpNode (_.VT _.RC:$src1), (i32 imm:$src2), @@ -6895,8 +6863,8 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), - OpcodeStr, "$src3,{sae}, $src2, $src1", - "$src1, $src2,{sae}, $src3", + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), @@ -6907,8 +6875,8 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), - OpcodeStr, "$src3,{sae}, $src2, $src1", - "$src1, $src2,{sae}, $src3", + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td index c4b2d6d3bb75..af43d9f53325 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -98,22 +98,22 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in { def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVZX>, TB, Sched<[WriteALU]>; + [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALU]>; let mayLoad = 1 in def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; + [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALULd]>; def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVSX>, TB, Sched<[WriteALU]>; + [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALU]>; let mayLoad = 1 in def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVSX>, TB, Sched<[WriteALULd]>; + [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALULd]>; } // MOVSX64rr8 always has a REX prefix and it has an 8-bit register @@ -146,18 +146,22 @@ def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), Sched<[WriteALULd]>, Requires<[In64BitMode]>; // movzbq and movzwq encodings for the disassembler -def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), - "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALU]>; -def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), - 
"movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALULd]>; -def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALU]>; -def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALULd]>; +let hasSideEffects = 0 in { +def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +} // 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a // 32-bit register. diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 829cedd55fb3..643286324e25 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -225,6 +225,9 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; +def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>; +def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>; + def X86vprot : SDNode<"X86ISD::VPROT", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index ea8e56206ce6..9c8339a841c9 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -1273,7 +1273,7 @@ def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins), let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins), "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32; -let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in +let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins), "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>; @@ -2755,56 +2755,56 @@ def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. -def : InstAlias<"bt {$imm, $mem|$mem, $imm}", +def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"btc {$imm, $mem|$mem, $imm}", +def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"btr {$imm, $mem|$mem, $imm}", +def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"bts {$imm, $mem|$mem, $imm}", +def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>; // clr aliases. 
-def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; -def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; -def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; -def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; +def : InstAlias<"clrb\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; // lods aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. -def : InstAlias<"lodsb $src", (LODSB srcidx8:$src), 0>; -def : InstAlias<"lodsw $src", (LODSW srcidx16:$src), 0>; -def : InstAlias<"lods{l|d} $src", (LODSL srcidx32:$src), 0>; -def : InstAlias<"lodsq $src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; -def : InstAlias<"lods {$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; -def : InstAlias<"lods {$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; -def : InstAlias<"lods {$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; -def : InstAlias<"lods {$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; // stos aliases. Accept the source being omitted because it's implicit in // the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the source. -def : InstAlias<"stosb $dst", (STOSB dstidx8:$dst), 0>; -def : InstAlias<"stosw $dst", (STOSW dstidx16:$dst), 0>; -def : InstAlias<"stos{l|d} $dst", (STOSL dstidx32:$dst), 0>; -def : InstAlias<"stosq $dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"stos {%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; -def : InstAlias<"stos {%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; -def : InstAlias<"stos {%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; -def : InstAlias<"stos {%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // scas aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. 
-def : InstAlias<"scasb $dst", (SCASB dstidx8:$dst), 0>; -def : InstAlias<"scasw $dst", (SCASW dstidx16:$dst), 0>; -def : InstAlias<"scas{l|d} $dst", (SCASL dstidx32:$dst), 0>; -def : InstAlias<"scasq $dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"scas {$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; -def : InstAlias<"scas {$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; -def : InstAlias<"scas {$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; -def : InstAlias<"scas {$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // div and idiv aliases for explicit A register. def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; @@ -2892,30 +2892,30 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but // this is compatible with what GAS does. -def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall {*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp {*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"lcall {*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp {*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; - -def : InstAlias<"call {*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"call {*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"call {*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaque32mem:$dst), 0>, 
Requires<[In16BitMode]>; + +def : InstAlias<"call\t{*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call\t{*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"call\t{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; // "imul <imm>, B" is an alias for "imul <imm>, B, B". -def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; -def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; -def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; -def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; -def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; -def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; // inb %dx -> inb %al, %dx def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; @@ -2927,46 +2927,46 @@ def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp -def : InstAlias<"call $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"jmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; -def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; -def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; -def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; // Force mov without a suffix with a segment and mem to prefer the 'l' form of // the move. 
All segment/mem forms are equivalent, this has the shortest // encoding. -def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; -def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; +def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. -def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; +def : InstAlias<"movq\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. -def : InstAlias<"movq {$src, $dst|$dst, $src}", +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; -def : InstAlias<"movq {$src, $dst|$dst, $src}", +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsx aliases -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. 
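The alias hunks above all make the same change: a literal '\t' now separates the mnemonic from its operand list, and stray spaces around the '|' inside the braces are dropped. The braces carry both operand orders at once, the AT&T form to the left of the '|' and the Intel form to the right. As a rough standalone illustration of how such a template collapses to a single dialect, here is a minimal C++ sketch under simplified assumptions: resolveDialect is a made-up helper, not LLVM's AsmWriter, and it assumes every brace group contains exactly one '|'.

    #include <cassert>
    #include <iostream>
    #include <string>

    // Resolve an asm template such as "movzx\t{$src, $dst|$dst, $src}" into one
    // dialect: inside {a|b}, keep 'a' for AT&T syntax and 'b' for Intel syntax.
    // Simplified sketch: braces are assumed non-nested and to contain a '|'.
    static std::string resolveDialect(const std::string &Tmpl, bool Intel) {
      std::string Out;
      for (size_t i = 0; i < Tmpl.size(); ++i) {
        if (Tmpl[i] != '{') {
          Out += Tmpl[i];
          continue;
        }
        size_t Bar = Tmpl.find('|', i);
        size_t End = Tmpl.find('}', i);
        assert(Bar != std::string::npos && End != std::string::npos && Bar < End);
        Out += Intel ? Tmpl.substr(Bar + 1, End - Bar - 1)
                     : Tmpl.substr(i + 1, Bar - i - 1);
        i = End; // continue after the closing brace
      }
      return Out;
    }

    int main() {
      std::string Tmpl = "movzx\t{$src, $dst|$dst, $src}";
      std::cout << resolveDialect(Tmpl, /*Intel=*/false) << "\n"; // "movzx\t$src, $dst"
      std::cout << resolveDialect(Tmpl, /*Intel=*/true) << "\n";  // "movzx\t$dst, $src"
      return 0;
    }

With the normalized strings, both dialects come out with the same tab as the mnemonic/operand separator, which is the consistency the whitespace cleanups above are after.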
// outb %dx -> outb %al, %dx diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td index 31608cd4c128..71ab97374dd6 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMPX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td @@ -15,10 +15,10 @@ multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src), - OpcodeStr#" \t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - OpcodeStr#" \t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, In64BitMode]>; } @@ -26,16 +26,16 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, In64BitMode]>; def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, In64BitMode]>; } defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS; @@ -43,28 +43,28 @@ defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD; defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD; def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, In64BitMode]>; def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs i64mem:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, In64BitMode]>; def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), - "bndstx \t{$src, $dst|$dst, $src}", []>, PS, + "bndstx\t{$src, $dst|$dst, $src}", []>, PS, Requires<[HasMPX]>; def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndldx \t{$src, $dst|$dst, $src}", []>, PS, + "bndldx\t{$src, $dst|$dst, $src}", []>, PS, Requires<[HasMPX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 624b9316e6fd..6a7c45665e9c 100644 --- 
a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -1808,7 +1808,7 @@ def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", - (CVTSD2SI64rm GR64:$dst, sdmem:$src)>; + (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; /// SSE 2 Only @@ -7838,9 +7838,7 @@ class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, - Sched<[Sched]>, VEX { - let mayLoad = 1; -} + Sched<[Sched]>, VEX; // AVX2 adds register forms class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, @@ -7871,7 +7869,7 @@ let ExeDomain = SSEPackedDouble in def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, v4f64, v2f64, WriteFShuffle256>, VEX_L; -let mayLoad = 1, Predicates = [HasAVX2] in +let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, @@ -8259,6 +8257,9 @@ let Predicates = [HasF16C] in { (VCVTPH2PSrm addr:$src)>; def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), (VCVTPH2PSrm addr:$src)>; + def : Pat<(int_x86_vcvtph2ps_128 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VCVTPH2PSrm addr:$src)>; def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 646b556faa8f..b525d5eb60a7 100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -29,7 +29,7 @@ enum IntrinsicType { INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, - EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, + EXPAND_FROM_MEM, LOADA, LOADU, BLEND, INSERT_SUBVEC, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; @@ -143,6 +143,18 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_128, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_256, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_512, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_128, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_256, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_512, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), 
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, @@ -1129,6 +1141,42 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNCS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, @@ -1165,6 +1213,42 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, @@ -1201,12 +1285,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_128, 
INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), @@ -1219,8 +1345,21 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv16_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv2_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv32hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv4_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv4_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv8_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv8_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), @@ -1243,8 +1382,15 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h 
b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 3a7a98db50f4..00515dde5568 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -92,6 +92,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// used to address arguments in a function using a base pointer. int SEHFramePtrSaveIndex = 0; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR = false; + private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. @@ -160,6 +164,9 @@ public: SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } + + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index 58020d909a43..45cc0aef1d93 100644 --- a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -9,8 +9,10 @@ // // This file defines the pass that performs some optimizations with LEA // instructions in order to improve code size. -// Currently, it does one thing: -// 1) Address calculations in load and store instructions are replaced by +// Currently, it does two things: +// 1) If there are two LEA instructions calculating addresses which only differ +// by displacement inside a basic block, one of them is removed. +// 2) Address calculations in load and store instructions are replaced by // existing LEA def registers where possible. // //===----------------------------------------------------------------------===// @@ -38,6 +40,7 @@ static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden, cl::init(false)); STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); +STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed"); namespace { class OptimizeLEAPass : public MachineFunctionPass { @@ -71,6 +74,13 @@ private: /// \brief Returns true if the instruction is LEA. bool isLEA(const MachineInstr &MI); + /// \brief Returns true if the \p Last LEA instruction can be replaced by the + /// \p First. The difference between displacements of the addresses calculated + /// by these LEAs is returned in \p AddrDispShift. It'll be used for proper + /// replacement of the \p Last LEA's uses with the \p First's def register. + bool isReplaceable(const MachineInstr &First, const MachineInstr &Last, + int64_t &AddrDispShift); + /// \brief Returns true if two instructions have memory operands that only /// differ by displacement. The numbers of the first memory operands for both /// instructions are specified through \p N1 and \p N2. The address @@ -79,13 +89,20 @@ private: const MachineInstr &MI2, unsigned N2, int64_t &AddrDispShift); - /// \brief Find all LEA instructions in the basic block. + /// \brief Find all LEA instructions in the basic block. Also, assign position + /// numbers to all instructions in the basic block to speed up calculation of + /// distance between them. void findLEAs(const MachineBasicBlock &MBB, SmallVectorImpl<MachineInstr *> &List); /// \brief Removes redundant address calculations. bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List); + /// \brief Removes LEAs which calculate similar addresses. 
+ bool removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List); + + DenseMap<const MachineInstr *, unsigned> InstrPos; + MachineRegisterInfo *MRI; const X86InstrInfo *TII; const X86RegisterInfo *TRI; @@ -99,14 +116,15 @@ FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, const MachineInstr &Last) { - const MachineBasicBlock *MBB = First.getParent(); - - // Both instructions must be in the same basic block. - assert(Last.getParent() == MBB && + // Both instructions must be in the same basic block and they must be + // presented in InstrPos. + assert(Last.getParent() == First.getParent() && "Instructions are in different basic blocks"); + assert(InstrPos.find(&First) != InstrPos.end() && + InstrPos.find(&Last) != InstrPos.end() && + "Instructions' positions are undefined"); - return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) - - std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First)); + return InstrPos[&Last] - InstrPos[&First]; } // Find the best LEA instruction in the List to replace address recalculation in @@ -189,6 +207,69 @@ bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; } +// Check that the Last LEA can be replaced by the First LEA. To be so, +// these requirements must be met: +// 1) Addresses calculated by LEAs differ only by displacement. +// 2) Def registers of LEAs belong to the same class. +// 3) All uses of the Last LEA def register are replaceable, thus the +// register is used only as address base. +bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, + const MachineInstr &Last, + int64_t &AddrDispShift) { + assert(isLEA(First) && isLEA(Last) && + "The function works only with LEA instructions"); + + // Compare instructions' memory operands. + if (!isSimilarMemOp(Last, 1, First, 1, AddrDispShift)) + return false; + + // Make sure that LEA def registers belong to the same class. There may be + // instructions (like MOV8mr_NOREX) which allow a limited set of registers to + // be used as their operands, so we must be sure that replacing one LEA + // with another won't lead to putting a wrong register in the instruction. + if (MRI->getRegClass(First.getOperand(0).getReg()) != + MRI->getRegClass(Last.getOperand(0).getReg())) + return false; + + // Loop over all uses of the Last LEA to check that its def register is + // used only as address base for memory accesses. If so, it can be + // replaced, otherwise - no. + for (auto &MO : MRI->use_operands(Last.getOperand(0).getReg())) { + MachineInstr &MI = *MO.getParent(); + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()); + + // If the use instruction has no memory operand - the LEA is not + // replaceable. + if (MemOpNo < 0) + return false; + + MemOpNo += X86II::getOperandBias(Desc); + + // If the address base of the use instruction is not the LEA def register - + // the LEA is not replaceable. + if (!isIdenticalOp(MI.getOperand(MemOpNo + X86::AddrBaseReg), MO)) + return false; + + // If the LEA def register is used as any other operand of the use + // instruction - the LEA is not replaceable. + for (unsigned i = 0; i < MI.getNumOperands(); i++) + if (i != (unsigned)(MemOpNo + X86::AddrBaseReg) && + isIdenticalOp(MI.getOperand(i), MO)) + return false; + + // Check that the new address displacement will fit 4 bytes. 
+ if (MI.getOperand(MemOpNo + X86::AddrDisp).isImm() && + !isInt<32>(MI.getOperand(MemOpNo + X86::AddrDisp).getImm() + + AddrDispShift)) + return false; + } + + return true; +} + // Check if MI1 and MI2 have memory operands which represent addresses that // differ only by displacement. bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, @@ -219,7 +300,15 @@ bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, SmallVectorImpl<MachineInstr *> &List) { + unsigned Pos = 0; for (auto &MI : MBB) { + // Assign the position number to the instruction. Note that we are going to + // move some instructions during the optimization however there will never + // be a need to move two instructions before any selected instruction. So to + // avoid multiple positions' updates during moves we just increase position + // counter by two leaving a free space for instructions which will be moved. + InstrPos[&MI] = Pos += 2; + if (isLEA(MI)) List.push_back(const_cast<MachineInstr *>(&MI)); } @@ -270,6 +359,13 @@ bool OptimizeLEAPass::removeRedundantAddrCalc( if (Dist < 0) { DefMI->removeFromParent(); MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + InstrPos[DefMI] = InstrPos[&MI] - 1; + + // Make sure the instructions' position numbers are sane. + assert(((InstrPos[DefMI] == 1 && DefMI == MBB->begin()) || + InstrPos[DefMI] > + InstrPos[std::prev(MachineBasicBlock::iterator(DefMI))]) && + "Instruction positioning is broken"); } // Since we can possibly extend register lifetime, clear kill flags. @@ -296,6 +392,81 @@ bool OptimizeLEAPass::removeRedundantAddrCalc( return Changed; } +// Try to find similar LEAs in the list and replace one with another. +bool +OptimizeLEAPass::removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + // Loop over all LEA pairs. + auto I1 = List.begin(); + while (I1 != List.end()) { + MachineInstr &First = **I1; + auto I2 = std::next(I1); + while (I2 != List.end()) { + MachineInstr &Last = **I2; + int64_t AddrDispShift; + + // LEAs should be in occurence order in the list, so we can freely + // replace later LEAs with earlier ones. + assert(calcInstrDist(First, Last) > 0 && + "LEAs must be in occurence order in the list"); + + // Check that the Last LEA instruction can be replaced by the First. + if (!isReplaceable(First, Last, AddrDispShift)) { + ++I2; + continue; + } + + // Loop over all uses of the Last LEA and update their operands. Note that + // the correctness of this has already been checked in the isReplaceable + // function. + for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()), + UE = MRI->use_end(); + UI != UE;) { + MachineOperand &MO = *UI++; + MachineInstr &MI = *MO.getParent(); + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + // Update address base. + MO.setReg(First.getOperand(0).getReg()); + + // Update address disp. + MachineOperand *Op = &MI.getOperand(MemOpNo + X86::AddrDisp); + if (Op->isImm()) + Op->setImm(Op->getImm() + AddrDispShift); + else if (Op->isGlobal()) + Op->setOffset(Op->getOffset() + AddrDispShift); + else + llvm_unreachable("Invalid address displacement operand"); + } + + // Since we can possibly extend register lifetime, clear kill flags. 
+ MRI->clearKillFlags(First.getOperand(0).getReg()); + + ++NumRedundantLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump();); + + // By this moment, all of the Last LEA's uses must be replaced. So we can + // freely remove it. + assert(MRI->use_empty(Last.getOperand(0).getReg()) && + "The LEA's def register must have no uses"); + Last.eraseFromParent(); + + // Erase removed LEA from the list. + I2 = List.erase(I2); + + Changed = true; + } + ++I1; + } + + return Changed; +} + bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; @@ -310,6 +481,7 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { // Process all basic blocks. for (auto &MBB : MF) { SmallVector<MachineInstr *, 16> LEAs; + InstrPos.clear(); // Find all LEA instructions in basic block. findLEAs(MBB, LEAs); @@ -318,6 +490,11 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { if (LEAs.empty()) continue; + // Remove redundant LEA instructions. The optimization may have a negative + // effect on performance, so do it only for -Oz. + if (MF.getFunction()->optForMinSize()) + Changed |= removeRedundantLEAs(LEAs); + // Remove redundant address calculations. Changed |= removeRedundantAddrCalc(LEAs); } diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index 58404433e1ae..274b56688558 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -250,7 +250,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_64_RT_AllRegs_SaveList; case CallingConv::CXX_FAST_TLS: if (Is64Bit) - return CSR_64_TLS_Darwin_SaveList; + return MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR() ? + CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList; break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) @@ -305,6 +306,15 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_32_SaveList; } +const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR()) + return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index f014c8f6ff61..8d0094cbf3d6 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -99,6 +99,8 @@ public: /// callee-save registers on this target. const MCPhysReg * getCalleeSavedRegs(const MachineFunction* MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const override; |
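The removeRedundantLEAs addition above works pair-wise inside a basic block: if two LEAs compute addresses that differ only by a constant displacement and their def registers share a register class, every use of the later LEA's def (which must be used purely as an address base) is rewritten to the earlier def with the displacement delta folded in, provided each adjusted displacement still fits in a signed 32-bit field, and the later LEA is erased; the InstrPos map replaces the old std::distance walk so instruction distances become O(1) lookups. The standalone C++ sketch below mirrors that idea on a toy address model; the Lea and MemUse structs, removeRedundantLea, and the register numbers are illustrative stand-ins, not LLVM's MachineInstr/MachineOperand API.

    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <vector>

    // Toy stand-ins for an LEA-style address computation and a memory use of it.
    struct Lea {
      int Def;          // virtual register defined by this LEA
      int Base, Index;  // address components other than the displacement
      int Scale;
      int64_t Disp;     // constant displacement
    };
    struct MemUse {
      int BaseReg;      // register used as the address base
      int64_t Disp;     // displacement folded into the memory operand
    };

    // Two LEAs are "similar" if everything but the displacement matches;
    // AddrDispShift is the amount later uses must be adjusted by.
    static bool isSimilar(const Lea &A, const Lea &B, int64_t &AddrDispShift) {
      if (A.Base != B.Base || A.Index != B.Index || A.Scale != B.Scale)
        return false;
      AddrDispShift = B.Disp - A.Disp;
      return true;
    }

    // Replace uses of Last.Def with First.Def, folding in the displacement delta,
    // as long as every adjusted displacement still fits in a signed 32-bit field.
    static bool removeRedundantLea(const Lea &First, const Lea &Last,
                                   std::vector<MemUse> &Uses) {
      int64_t Shift;
      if (!isSimilar(First, Last, Shift))
        return false;
      for (const MemUse &U : Uses)
        if (U.BaseReg == Last.Def &&
            (U.Disp + Shift > std::numeric_limits<int32_t>::max() ||
             U.Disp + Shift < std::numeric_limits<int32_t>::min()))
          return false; // new displacement would not fit 4 bytes
      for (MemUse &U : Uses)
        if (U.BaseReg == Last.Def) {
          U.BaseReg = First.Def;
          U.Disp += Shift;
        }
      return true; // caller would now erase Last
    }

    int main() {
      Lea L1{/*Def=*/1, /*Base=*/10, /*Index=*/0, /*Scale=*/1, /*Disp=*/8};
      Lea L2{/*Def=*/2, /*Base=*/10, /*Index=*/0, /*Scale=*/1, /*Disp=*/24};
      std::vector<MemUse> Uses = {{2, 0}, {2, 4}};
      if (removeRedundantLea(L1, L2, Uses))
        for (const MemUse &U : Uses)
          std::cout << "use base=%" << U.BaseReg << " disp=" << U.Disp << "\n";
      // prints: use base=%1 disp=16, then use base=%1 disp=20
      return 0;
    }

As the pass comments note, the real transformation is only attempted when the function is built for minimum size (optForMinSize), since funneling several displacements through one LEA can cost performance elsewhere.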